io_posix.cc
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifdef ROCKSDB_LIB_IO_POSIX
#include "env/io_posix.h"
#include <errno.h>
#include <fcntl.h>
#include <algorithm>
#if defined(OS_LINUX)
#include <linux/fs.h>
#ifndef FALLOC_FL_KEEP_SIZE
#include <linux/falloc.h>
#endif
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef OS_LINUX
#include <sys/statfs.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#endif
#include "monitoring/iostats_context_imp.h"
#include "port/port.h"
#include "rocksdb/slice.h"
#include "test_util/sync_point.h"
#include "util/autovector.h"
#include "util/coding.h"
#include "util/string_util.h"
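
// F_SET_RW_HINT may be absent from older kernel headers; define the fcntl
// constants locally so SetWriteLifeTimeHint() below can still compile on
// Linux.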
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif

namespace ROCKSDB_NAMESPACE {

// A wrapper for fadvise. If the platform doesn't support fadvise,
// it simply returns 0.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
  return posix_fadvise(fd, offset, len, advice);
#else
  (void)fd;
  (void)offset;
  (void)len;
  (void)advice;
  return 0;  // simply do nothing.
#endif
}
namespace {

// On MacOS (and probably *BSD), the posix write and pwrite calls do not
// support buffers larger than 2^31-1 bytes. These two wrappers fix this issue
// by cutting the buffer into 1GB chunks. We use this chunk size to be sure to
// keep the writes aligned.
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  const size_t kLimit1Gb = 1UL << 30;
  const char* src = buf;
  size_t left = nbyte;
  while (left != 0) {
    size_t bytes_to_write = std::min(left, kLimit1Gb);
    ssize_t done = write(fd, src, bytes_to_write);
    if (done < 0) {
      if (errno == EINTR) {
        continue;
      }
      return false;
    }
    left -= done;
    src += done;
  }
  return true;
}

bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  const size_t kLimit1Gb = 1UL << 30;
  const char* src = buf;
  size_t left = nbyte;
  while (left != 0) {
    size_t bytes_to_write = std::min(left, kLimit1Gb);
    ssize_t done = pwrite(fd, src, bytes_to_write, offset);
    if (done < 0) {
      if (errno == EINTR) {
        continue;
      }
      return false;
    }
    left -= done;
    offset += done;
    src += done;
  }
  return true;
}
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return kDefaultPageSize;
  }
  if (major(buf.st_dev) == 0) {
    // Unnamed devices (e.g. non-device mounts) use the reserved null device
    // number. These don't have an entry in /sys/dev/block/. Return a sensible
    // default.
    return kDefaultPageSize;
  }

  // Reading queue/logical_block_size does not require special permissions.
  const int kBufferSize = 100;
  char path[kBufferSize];
  char real_path[PATH_MAX + 1];
  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
           minor(buf.st_dev));
  if (realpath(path, real_path) == nullptr) {
    return kDefaultPageSize;
  }
  std::string device_dir(real_path);
  if (!device_dir.empty() && device_dir.back() == '/') {
    device_dir.pop_back();
  }
  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent
  // sda and nvme0n1 have it.
  // $ ls -al '/sys/dev/block/8:3'
  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
  // ../../block/sda/sda3
  // $ ls -al '/sys/dev/block/259:4'
  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
  if (parent_end == std::string::npos) {
    return kDefaultPageSize;
  }
  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
  if (parent_begin == std::string::npos) {
    return kDefaultPageSize;
  }
  std::string parent =
      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
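  // If device_dir resolves to a partition (e.g. .../sda/sda3 or
  // .../nvme0n1/nvme0n1p1), only the parent device directory has the
  // `queue/` subdir, so strip the last path component. Note that
  // child.compare(0, 4, "nvme") is non-zero when the name does not start
  // with "nvme".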
  if (parent != "block" &&
      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
    device_dir = device_dir.substr(0, parent_end);
  }
  std::string fname = device_dir + "/queue/logical_block_size";
  FILE* fp;
  size_t size = 0;
  fp = fopen(fname.c_str(), "r");
  if (fp != nullptr) {
    char* line = nullptr;
    size_t len = 0;
    if (getline(&line, &len, fp) != -1) {
      sscanf(line, "%zu", &size);
    }
    free(line);
    fclose(fp);
  }
  if (size != 0 && (size & (size - 1)) == 0) {
    return size;
  }
#endif
  return kDefaultPageSize;
}
#ifdef ROCKSDB_RANGESYNC_PRESENT

#if !defined(ZFS_SUPER_MAGIC)
// The magic number for ZFS was not exposed until recently. It should remain
// fixed forever, so we can just copy the magic number here.
#define ZFS_SUPER_MAGIC 0x2fc12fc1
#endif

bool IsSyncFileRangeSupported(int fd) {
  // The approach taken in this function is to build a blacklist of cases where
  // we know `sync_file_range` definitely will not work properly despite
  // passing the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are
  // unsure, or if any of the checks fail in unexpected ways, we allow
  // `sync_file_range` to be used. This should minimize the risk of impacting
  // existing use cases.
  struct statfs buf;
  int ret = fstatfs(fd, &buf);
  assert(ret == 0);
  if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
    // Testing on ZFS showed the writeback did not happen asynchronously when
    // `sync_file_range` was called, even though it returned success. Avoid it
    // and use `fdatasync` instead to preserve the contract of
    // `bytes_per_sync`, even though this'll incur extra I/O for metadata.
    return false;
  }

  ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
  assert(!(ret == -1 && errno != ENOSYS));
  if (ret == -1 && errno == ENOSYS) {
    // `sync_file_range` is not implemented on all platforms even if
    // compile-time checks pass and a supported filesystem is in-use. For
    // example, using ext4 on WSL (Windows Subsystem for Linux),
    // `sync_file_range()` returns `ENOSYS`
    // ("Function not implemented").
    return false;
  }
  // None of the cases on the blacklist matched, so allow `sync_file_range`
  // use.
  return true;
}

#undef ZFS_SUPER_MAGIC

#endif  // ROCKSDB_RANGESYNC_PRESENT

}  // anonymous namespace
/*
 * DirectIOHelper
 */
#ifndef NDEBUG
namespace {

bool IsSectorAligned(const size_t off, size_t sector_size) {
  return off % sector_size == 0;
}

bool IsSectorAligned(const void* ptr, size_t sector_size) {
  return uintptr_t(ptr) % sector_size == 0;
}

}  // namespace
#endif

/*
 * PosixSequentialFile
 */
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
                                         int fd, const EnvOptions& options)
    : filename_(fname),
      file_(file),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(GetLogicalBufferSize(fd_)) {
  assert(!options.use_direct_reads || !options.use_mmap_reads);
}
PosixSequentialFile::~PosixSequentialFile() {
  if (!use_direct_io()) {
    assert(file_);
    fclose(file_);
  } else {
    assert(fd_);
    close(fd_);
  }
}

IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
                                   Slice* result, char* scratch,
                                   IODebugContext* /*dbg*/) {
  assert(result != nullptr && !use_direct_io());
  IOStatus s;
  size_t r = 0;
  do {
    r = fread_unlocked(scratch, 1, n, file_);
  } while (r == 0 && ferror(file_) && errno == EINTR);
  *result = Slice(scratch, r);
  if (r < n) {
    if (feof(file_)) {
      // We leave status as ok if we hit the end of the file.
      // We also clear the error so that reads can continue
      // if new data is written to the file.
      clearerr(file_);
    } else {
      // A partial read with an error: return a non-ok status
      s = IOError("While reading file sequentially", filename_, errno);
    }
  }
  return s;
}
IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
                                             const IOOptions& /*opts*/,
                                             Slice* result, char* scratch,
                                             IODebugContext* /*dbg*/) {
  assert(use_direct_io());
  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));

  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
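  // pread may return fewer bytes than requested (e.g. near EOF or after a
  // signal), so keep issuing reads until the request is satisfied or an
  // error/short sector is seen.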
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes read don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    s = IOError(
        "While pread " + ToString(n) + " bytes from offset " + ToString(offset),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
IOStatus PosixSequentialFile::Skip(uint64_t n) {
  if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
    return IOError("While fseek to skip " + ToString(n) + " bytes", filename_,
                   errno);
  }
  return IOStatus::OK();
}

IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  if (!use_direct_io()) {
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret != 0) {
      return IOError("While fadvise NotNeeded offset " + ToString(offset) +
                         " len " + ToString(length),
                     filename_, errno);
    }
  }
  return IOStatus::OK();
#endif
}

/*
 * PosixRandomAccessFile
 */
#if defined(OS_LINUX)
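// Builds a unique file ID from the device number, the inode number, and the
// inode version returned by the FS_IOC_GETVERSION ioctl, each encoded as a
// varint.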
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  long version = 0;
  result = ioctl(fd, FS_IOC_GETVERSION, &version);
  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
  if (result == -1) {
    return 0;
  }
  uint64_t uversion = (uint64_t)version;

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, uversion);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
#endif

#if defined(OS_MACOSX) || defined(OS_AIX)
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, buf.st_gen);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
#endif
/*
 * PosixRandomAccessFile
 *
 * pread() based random-access
 */
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(GetLogicalBufferSize(fd_))
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  assert(!options.use_direct_reads || !options.use_mmap_reads);
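  // mmap-based reads are handled by PosixMmapReadableFile; this pread-based
  // class only expects use_mmap_reads on 32-bit builds.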
  assert(!options.use_mmap_reads || sizeof(void*) < 8);
}

PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }

IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* scratch,
                                     IODebugContext* /*dbg*/) const {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
  }
  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (use_direct_io() &&
        r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes read don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    s = IOError(
        "While pread offset " + ToString(offset) + " len " + ToString(n),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
                                          size_t num_reqs,
                                          const IOOptions& options,
                                          IODebugContext* dbg) {
#if defined(ROCKSDB_IOURING_PRESENT)
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring. Fall back to
  // serialized reads.
  if (iu == nullptr) {
    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  }

  struct WrappedReadRequest {
    FSReadRequest* req;
    struct iovec iov;
    size_t finished_len;
    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
  };

  autovector<WrappedReadRequest, 32> req_wraps;
  autovector<WrappedReadRequest*, 4> incomplete_rq_list;

  for (size_t i = 0; i < num_reqs; i++) {
    req_wraps.emplace_back(&reqs[i]);
  }

  size_t reqs_off = 0;
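  // Each pass through this loop submits up to kIoUringDepth readv requests
  // (new requests plus any that previously completed with a short read),
  // waits for all of their completions, and re-queues requests whose reads
  // came back short.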
  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();

    // If the requests exceed the io_uring depth, split them into batches.
    if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;

    assert(incomplete_rq_list.size() <= this_reqs);
    for (size_t i = 0; i < this_reqs; i++) {
      WrappedReadRequest* rep_to_submit;
      if (i < incomplete_rq_list.size()) {
        rep_to_submit = incomplete_rq_list[i];
      } else {
        rep_to_submit = &req_wraps[reqs_off++];
      }
      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
      rep_to_submit->iov.iov_base =
          rep_to_submit->req->scratch + rep_to_submit->finished_len;
      rep_to_submit->iov.iov_len =
          rep_to_submit->req->len - rep_to_submit->finished_len;

      struct io_uring_sqe* sqe;
      sqe = io_uring_get_sqe(iu);
      io_uring_prep_readv(
          sqe, fd_, &rep_to_submit->iov, 1,
          rep_to_submit->req->offset + rep_to_submit->finished_len);
      io_uring_sqe_set_data(sqe, rep_to_submit);
    }
    incomplete_rq_list.clear();

    ssize_t ret =
        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
    if (static_cast<size_t>(ret) != this_reqs) {
      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
    }
    assert(static_cast<size_t>(ret) == this_reqs);

    for (size_t i = 0; i < this_reqs; i++) {
      struct io_uring_cqe* cqe;
      WrappedReadRequest* req_wrap;

      // We could use the peek variant here, but this seems safer in terms
      // of our initial wait not reaping all completions
      ret = io_uring_wait_cqe(iu, &cqe);
      assert(!ret);

      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
      FSReadRequest* req = req_wrap->req;
      if (cqe->res < 0) {
        req->result = Slice(req->scratch, 0);
        req->status = IOError("Req failed", filename_, cqe->res);
      } else {
        size_t bytes_read = static_cast<size_t>(cqe->res);
        TEST_SYNC_POINT_CALLBACK(
            "PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read);
        if (bytes_read == req_wrap->iov.iov_len) {
          req->result = Slice(req->scratch, req->len);
          req->status = IOStatus::OK();
        } else if (bytes_read == 0) {
          // cqe->res == 0 can mean EOF, or can mean partial results. See
          // comment
          // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
          // Fall back to pread in this case.
          Slice tmp_slice;
          req->status =
              Read(req->offset + req_wrap->finished_len,
                   req->len - req_wrap->finished_len, options, &tmp_slice,
                   req->scratch + req_wrap->finished_len, dbg);
          req->result =
              Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
        } else if (bytes_read < req_wrap->iov.iov_len) {
          assert(bytes_read > 0);
          assert(bytes_read + req_wrap->finished_len < req->len);
          req_wrap->finished_len += bytes_read;
          incomplete_rq_list.push_back(req_wrap);
        } else {
          req->result = Slice(req->scratch, 0);
          req->status = IOError("Req returned more bytes than requested",
                                filename_, cqe->res);
        }
      }
      io_uring_cqe_seen(iu, cqe);
    }
  }
  return IOStatus::OK();
#else
  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
#endif
}
IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
                                         const IOOptions& /*opts*/,
                                         IODebugContext* /*dbg*/) {
  IOStatus s;
  if (!use_direct_io()) {
    ssize_t r = 0;
#ifdef OS_LINUX
    r = readahead(fd_, offset, n);
#endif
#ifdef OS_MACOSX
    radvisory advice;
    advice.ra_offset = static_cast<off_t>(offset);
    advice.ra_count = static_cast<int>(n);
    r = fcntl(fd_, F_RDADVISE, &advice);
#endif
    if (r == -1) {
      s = IOError("While prefetching offset " + ToString(offset) + " len " +
                      ToString(n),
                  filename_, errno);
    }
  }
  return s;
}

#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
#endif

void PosixRandomAccessFile::Hint(AccessPattern pattern) {
  if (use_direct_io()) {
    return;
  }
  switch (pattern) {
    case kNormal:
      Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
      break;
    case kRandom:
      Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
      break;
    case kSequential:
      Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
      break;
    case kWillNeed:
      Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
      break;
    case kWontNeed:
      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
      break;
    default:
      assert(false);
      break;
  }
}
IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
  if (use_direct_io()) {
    return IOStatus::OK();
  }
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise NotNeeded offset " + ToString(offset) +
                     " len " + ToString(length),
                 filename_, errno);
#endif
}

/*
 * PosixMmapReadableFile
 *
 * mmap() based random-access
 */
// base[0,length-1] contains the mmapped contents of the file.
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
                                             const std::string& fname,
                                             void* base, size_t length,
                                             const EnvOptions& options)
    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
#ifdef NDEBUG
  (void)options;
#endif
  fd_ = fd_ + 0;  // suppress the unused-variable warning
  assert(options.use_mmap_reads);
  assert(!options.use_direct_reads);
}
PosixMmapReadableFile::~PosixMmapReadableFile() {
  int ret = munmap(mmapped_region_, length_);
  if (ret != 0) {
    fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
            mmapped_region_, length_);
  }
  close(fd_);
}

IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* /*scratch*/,
                                     IODebugContext* /*dbg*/) const {
  IOStatus s;
  if (offset > length_) {
    *result = Slice();
    return IOError("While mmap read offset " + ToString(offset) +
                       " larger than file length " + ToString(length_),
                   filename_, EINVAL);
  } else if (offset + n > length_) {
    n = static_cast<size_t>(length_ - offset);
  }
  *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
  return s;
}

IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise not needed. Offset " + ToString(offset) +
                     " len " + ToString(length),
                 filename_, errno);
#endif
}
/*
 * PosixMmapFile
 *
 * We preallocate up to an extra megabyte and use memcpy to append new
 * data to the file. This is safe since we either properly close the
 * file before reading from it, or for log files, the reading code
 * knows enough to skip zero suffixes.
 */
IOStatus PosixMmapFile::UnmapCurrentRegion() {
  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
  if (base_ != nullptr) {
    int munmap_status = munmap(base_, limit_ - base_);
    if (munmap_status != 0) {
      return IOError("While munmap", filename_, munmap_status);
    }
    file_offset_ += limit_ - base_;
    base_ = nullptr;
    limit_ = nullptr;
    last_sync_ = nullptr;
    dst_ = nullptr;

    // Increase the amount we map the next time, but capped at 1MB
    if (map_size_ < (1 << 20)) {
      map_size_ *= 2;
    }
  }
  return IOStatus::OK();
}
IOStatus PosixMmapFile::MapNewRegion() {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  assert(base_ == nullptr);
  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
  // we can't fallocate with FALLOC_FL_KEEP_SIZE here
  if (allow_fallocate_) {
    IOSTATS_TIMER_GUARD(allocate_nanos);
    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
    if (alloc_status != 0) {
      // fallback to posix_fallocate
      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    }
    if (alloc_status != 0) {
      return IOStatus::IOError("Error allocating space to file : " + filename_ +
                               ". Error : " + strerror(alloc_status));
    }
  }

  TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
  void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                   file_offset_);
  if (ptr == MAP_FAILED) {
    return IOStatus::IOError("MMap failed on " + filename_);
  }
  TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);

  base_ = reinterpret_cast<char*>(ptr);
  limit_ = base_ + map_size_;
  dst_ = base_;
  last_sync_ = base_;
  return IOStatus::OK();
#else
  return IOStatus::NotSupported("This platform doesn't support fallocate()");
#endif
}
IOStatus PosixMmapFile::Msync() {
  if (dst_ == last_sync_) {
    return IOStatus::OK();
  }
  // Find the beginnings of the pages that contain the first and last
  // bytes to be synced.
  size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
  size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
  last_sync_ = dst_;
  TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
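  // p2 is the start of the page containing the last dirty byte, so syncing
  // p2 - p1 + page_size_ bytes covers every page from p1 through that page.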
  if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
    return IOError("While msync", filename_, errno);
  }
  return IOStatus::OK();
}

PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                             const EnvOptions& options)
    : filename_(fname),
      fd_(fd),
      page_size_(page_size),
      map_size_(Roundup(65536, page_size)),
      base_(nullptr),
      limit_(nullptr),
      dst_(nullptr),
      last_sync_(nullptr),
      file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#else
  (void)options;
#endif
  assert((page_size & (page_size - 1)) == 0);
  assert(options.use_mmap_writes);
  assert(!options.use_direct_writes);
}
PosixMmapFile::~PosixMmapFile() {
  if (fd_ >= 0) {
    PosixMmapFile::Close(IOOptions(), nullptr);
  }
}

IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
                               IODebugContext* /*dbg*/) {
  const char* src = data.data();
  size_t left = data.size();
  while (left > 0) {
    assert(base_ <= dst_);
    assert(dst_ <= limit_);
    size_t avail = limit_ - dst_;
    if (avail == 0) {
      IOStatus s = UnmapCurrentRegion();
      if (!s.ok()) {
        return s;
      }
      s = MapNewRegion();
      if (!s.ok()) {
        return s;
      }
      TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
    }
    size_t n = (left <= avail) ? left : avail;
    assert(dst_);
    memcpy(dst_, src, n);
    dst_ += n;
    src += n;
    left -= n;
  }
  return IOStatus::OK();
}

IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  IOStatus s;
  size_t unused = limit_ - dst_;

  s = UnmapCurrentRegion();
  if (!s.ok()) {
    s = IOError("While closing mmapped file", filename_, errno);
  } else if (unused > 0) {
    // Trim the extra space at the end of the file
    if (ftruncate(fd_, file_offset_ - unused) < 0) {
      s = IOError("While ftruncating mmapped file", filename_, errno);
    }
  }

  if (close(fd_) < 0) {
    if (s.ok()) {
      s = IOError("While closing mmapped file", filename_, errno);
    }
  }

  fd_ = -1;
  base_ = nullptr;
  limit_ = nullptr;
  return s;
}

IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}

IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
                             IODebugContext* /*dbg*/) {
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync mmapped file", filename_, errno);
  }
  return Msync();
}
/**
 * Flush data as well as metadata to stable storage.
 */
IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  if (fsync(fd_) < 0) {
    return IOError("While fsync mmapped file", filename_, errno);
  }
  return Msync();
}

/**
 * Get the size of valid data in the file. This will not match the
 * size that is returned from the filesystem because we use mmap
 * to extend the file by map_size every time.
 */
uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
                                    IODebugContext* /*dbg*/) {
  size_t used = dst_ - base_;
  return file_offset_ + used;
}
IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
#endif
}

#ifdef ROCKSDB_FALLOCATE_PRESENT
IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                 const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
  int alloc_status = 0;
  if (allow_fallocate_) {
    alloc_status =
        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
                  static_cast<off_t>(offset), static_cast<off_t>(len));
  }
  if (alloc_status == 0) {
    return IOStatus::OK();
  } else {
    return IOError(
        "While fallocate offset " + ToString(offset) + " len " + ToString(len),
        filename_, errno);
  }
}
#endif
/*
 * PosixWritableFile
 *
 * Use posix write to write data to a file.
 */
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                     const EnvOptions& options)
    : FSWritableFile(options),
      filename_(fname),
      use_direct_io_(options.use_direct_writes),
      fd_(fd),
      filesize_(0),
      logical_sector_size_(GetLogicalBufferSize(fd_)) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
#ifdef ROCKSDB_RANGESYNC_PRESENT
  sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
#endif  // ROCKSDB_RANGESYNC_PRESENT
  assert(!options.use_mmap_writes);
}

PosixWritableFile::~PosixWritableFile() {
  if (fd_ >= 0) {
    PosixWritableFile::Close(IOOptions(), nullptr);
  }
}
IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
                                   IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  }
  const char* src = data.data();
  size_t nbytes = data.size();
  if (!PosixWrite(fd_, src, nbytes)) {
    return IOError("While appending to file", filename_, errno);
  }
  filesize_ += nbytes;
  return IOStatus::OK();
}

IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
                                             const IOOptions& /*opts*/,
                                             IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  }
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  const char* src = data.data();
  size_t nbytes = data.size();
  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
    return IOError("While pwrite to file at offset " + ToString(offset),
                   filename_, errno);
  }
  filesize_ = offset + nbytes;
  return IOStatus::OK();
}

IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  IOStatus s;
  int r = ftruncate(fd_, size);
  if (r < 0) {
    s = IOError("While ftruncate file to size " + ToString(size), filename_,
                errno);
  } else {
    filesize_ = size;
  }
  return s;
}

IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  IOStatus s;

  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((__unused__));
    dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) && \
    !defined(TRAVIS)
    // In some file systems, ftruncate only trims trailing space if the
    // new file size is smaller than the current size. Calling fallocate
    // with the FALLOC_FL_PUNCH_HOLE flag explicitly releases these unused
    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
    // filesystems:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // We ignore the error since failure of this operation does not affect
    // correctness.
    // TRAVIS - this code does not work on TRAVIS filesystems.
    // The FALLOC_FL_KEEP_SIZE option is expected to not change the size
    // of the file, but it does. A simple strace report will show that.
    // While we work with the Travis-CI team to figure out if this is a
    // quirk of Docker/AUFS, we will comment this out.
    struct stat file_stats;
    int result = fstat(fd_, &file_stats);
    // After ftruncate, we check whether ftruncate has the correct behavior.
    // If not, we should work around it with FALLOC_FL_PUNCH_HOLE.
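    // st_blocks is reported in 512-byte units, so the right-hand side below is
    // the number of allocated blocks expressed in st_blksize units; if it
    // differs from what the logical size requires, punch a hole over the
    // unused tail.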
    if (result == 0 &&
        (file_stats.st_size + file_stats.st_blksize - 1) /
                file_stats.st_blksize !=
            file_stats.st_blocks / (file_stats.st_blksize / 512)) {
      IOSTATS_TIMER_GUARD(allocate_nanos);
      if (allow_fallocate_) {
        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                  block_size * last_allocated_block - filesize_);
      }
    }
#endif
  }

  if (close(fd_) < 0) {
    s = IOError("While closing file after writing", filename_, errno);
  }
  fd_ = -1;
  return s;
}

// write out the cached data to the OS cache
IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}

IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync", filename_, errno);
  }
  return IOStatus::OK();
}

IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  if (fsync(fd_) < 0) {
    return IOError("While fsync", filename_, errno);
  }
  return IOStatus::OK();
}

bool PosixWritableFile::IsSyncThreadSafe() const { return true; }

uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
                                        IODebugContext* /*dbg*/) {
  return filesize_;
}
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
// Suppress Valgrind "Unimplemented functionality" error.
#ifndef ROCKSDB_VALGRIND_RUN
  if (hint == write_hint_) {
    return;
  }
  if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
    write_hint_ = hint;
  }
#else
  (void)hint;
#endif  // ROCKSDB_VALGRIND_RUN
#else
  (void)hint;
#endif  // OS_LINUX
}

IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
  if (use_direct_io()) {
    return IOStatus::OK();
  }
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise NotNeeded", filename_, errno);
#endif
}

#ifdef ROCKSDB_FALLOCATE_PRESENT
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                     const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
  IOSTATS_TIMER_GUARD(allocate_nanos);
  int alloc_status = 0;
  if (allow_fallocate_) {
    alloc_status =
        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
                  static_cast<off_t>(offset), static_cast<off_t>(len));
  }
  if (alloc_status == 0) {
    return IOStatus::OK();
  } else {
    return IOError(
        "While fallocate offset " + ToString(offset) + " len " + ToString(len),
        filename_, errno);
  }
}
#endif
IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
                                      const IOOptions& opts,
                                      IODebugContext* dbg) {
#ifdef ROCKSDB_RANGESYNC_PRESENT
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  if (sync_file_range_supported_) {
    int ret;
    if (strict_bytes_per_sync_) {
      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an
      // offset/length that spans all bytes written so far tells
      // `sync_file_range` to wait for any outstanding writeback requests to
      // finish before issuing a new one.
      ret =
          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
    } else {
      ret = sync_file_range(fd_, static_cast<off_t>(offset),
                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
    }
    if (ret != 0) {
      return IOError("While sync_file_range returned " + ToString(ret),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
#endif  // ROCKSDB_RANGESYNC_PRESENT
  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
}

#ifdef OS_LINUX
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
/*
 * PosixRandomRWFile
 */
PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
                                     const EnvOptions& /*options*/)
    : filename_(fname), fd_(fd) {}

PosixRandomRWFile::~PosixRandomRWFile() {
  if (fd_ >= 0) {
    Close(IOOptions(), nullptr);
  }
}

IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
                                  const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  const char* src = data.data();
  size_t nbytes = data.size();
  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
    return IOError(
        "While write random read/write file at offset " + ToString(offset),
        filename_, errno);
  }
  return IOStatus::OK();
}

IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
                                 const IOOptions& /*opts*/, Slice* result,
                                 char* scratch, IODebugContext* /*dbg*/) const {
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    ssize_t done = pread(fd_, ptr, left, offset);
    if (done < 0) {
      // error while reading from file
      if (errno == EINTR) {
        // read was interrupted, try again.
        continue;
      }
      return IOError("While reading random read/write file offset " +
                         ToString(offset) + " len " + ToString(n),
                     filename_, errno);
    } else if (done == 0) {
      // Nothing more to read
      break;
    }

    // Read `done` bytes
    ptr += done;
    offset += done;
    left -= done;
  }

  *result = Slice(scratch, n - left);
  return IOStatus::OK();
}

IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}

IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync random read/write file", filename_, errno);
  }
  return IOStatus::OK();
}

IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  if (fsync(fd_) < 0) {
    return IOError("While fsync random read/write file", filename_, errno);
  }
  return IOStatus::OK();
}

IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  if (close(fd_) < 0) {
    return IOError("While close random read/write file", filename_, errno);
  }
  fd_ = -1;
  return IOStatus::OK();
}
PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  // TODO should have error handling though not much we can do...
  munmap(this->base_, length_);
}

/*
 * PosixDirectory
 */
PosixDirectory::~PosixDirectory() { close(fd_); }

IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/,
                               IODebugContext* /*dbg*/) {
#ifndef OS_AIX
  if (fsync(fd_) == -1) {
    return IOError("While fsync", "a directory", errno);
  }
#endif
  return IOStatus::OK();
}

}  // namespace ROCKSDB_NAMESPACE
#endif