fs_posix.cc 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors
  9. #if !defined(OS_WIN)
  10. #include <dirent.h>
  11. #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
  12. #include <dlfcn.h>
  13. #endif
  14. #include <fcntl.h>
  15. #include <pthread.h>
  16. #include <sys/ioctl.h>
  17. #include <sys/mman.h>
  18. #include <sys/stat.h>
  19. #include <cerrno>
  20. #include <csignal>
  21. #include <cstdio>
  22. #include <cstdlib>
  23. #include <cstring>
  24. #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
  25. #include <sys/statfs.h>
  26. #include <sys/sysmacros.h>
  27. #endif
  28. #include <sys/statvfs.h>
  29. #include <sys/time.h>
  30. #include <sys/types.h>
  31. #include <algorithm>
  32. #include <ctime>
  33. // Get nano time includes
  34. #if defined(OS_LINUX) || defined(OS_FREEBSD)
  35. #elif defined(__MACH__)
  36. #include <Availability.h>
  37. #include <mach/clock.h>
  38. #include <mach/mach.h>
  39. #else
  40. #include <chrono>
  41. #endif
  42. #include <deque>
  43. #include <set>
  44. #include <vector>
  45. #include "env/composite_env_wrapper.h"
  46. #include "env/io_posix.h"
  47. #include "monitoring/iostats_context_imp.h"
  48. #include "monitoring/thread_status_updater.h"
  49. #include "options/db_options.h"
  50. #include "port/lang.h"
  51. #include "port/port.h"
  52. #include "rocksdb/options.h"
  53. #include "rocksdb/slice.h"
  54. #include "rocksdb/utilities/object_registry.h"
  55. #include "test_util/sync_point.h"
  56. #include "util/coding.h"
  57. #include "util/compression_context_cache.h"
  58. #include "util/random.h"
  59. #include "util/string_util.h"
  60. #include "util/thread_local.h"
  61. #include "util/threadpool_imp.h"
  62. #if !defined(TMPFS_MAGIC)
  63. #define TMPFS_MAGIC 0x01021994
  64. #endif
  65. #if !defined(XFS_SUPER_MAGIC)
  66. #define XFS_SUPER_MAGIC 0x58465342
  67. #endif
  68. #if !defined(EXT4_SUPER_MAGIC)
  69. #define EXT4_SUPER_MAGIC 0xEF53
  70. #endif
  71. extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__));
  72. namespace ROCKSDB_NAMESPACE {
  73. namespace {
  // Returns the permission bits passed to open() for DB files:
  // 0644 when non-owner access is allowed, 0600 (owner only) otherwise.
  inline mode_t GetDBFileMode(bool allow_non_owner_access) {
    if (allow_non_owner_access) {
      return 0644;
    }
    return 0600;
  }
  77. // list of pathnames that are locked
  78. // Only used for error message.
  79. struct LockHoldingInfo {
  80. int64_t acquire_time;
  81. uint64_t acquiring_thread;
  82. };
  83. static std::map<std::string, LockHoldingInfo> locked_files;
  84. static port::Mutex mutex_locked_files;
  // Takes (lock == true) or drops (lock == false) an fcntl() record lock that
  // spans the whole file behind `fd`. The request uses F_SETLK, and the return
  // value is whatever fcntl() returned. errno is cleared before the call.
  static int LockOrUnlock(int fd, bool lock) {
    errno = 0;
    struct flock request;
    memset(&request, 0, sizeof(request));
    if (lock) {
      request.l_type = F_WRLCK;
    } else {
      request.l_type = F_UNLCK;
    }
    request.l_whence = SEEK_SET;
    request.l_start = 0;
    request.l_len = 0;  // Lock/unlock entire file
    return fcntl(fd, F_SETLK, &request);
  }
  96. class PosixFileLock : public FileLock {
  97. public:
  98. int fd_ = /*invalid*/ -1;
  99. std::string filename;
  100. void Clear() {
  101. fd_ = -1;
  102. filename.clear();
  103. }
  104. ~PosixFileLock() override {
  105. // Check for destruction without UnlockFile
  106. assert(fd_ == -1);
  107. }
  108. };
  109. int cloexec_flags(int flags, const EnvOptions* options) {
  110. // If the system supports opening the file with cloexec enabled,
  111. // do so, as this avoids a race condition if a db is opened around
  112. // the same time that a child process is forked
  113. #ifdef O_CLOEXEC
  114. if (options == nullptr || options->set_fd_cloexec) {
  115. flags |= O_CLOEXEC;
  116. }
  117. #else
  118. (void)options;
  119. #endif
  120. return flags;
  121. }
  122. class PosixFileSystem : public FileSystem {
  123. public:
  124. PosixFileSystem();
  125. static const char* kClassName() { return "PosixFileSystem"; }
  126. const char* Name() const override { return kClassName(); }
  127. const char* NickName() const override { return kDefaultName(); }
  128. ~PosixFileSystem() override = default;
  129. bool IsInstanceOf(const std::string& name) const override {
  130. if (name == "posix") {
  131. return true;
  132. } else {
  133. return FileSystem::IsInstanceOf(name);
  134. }
  135. }
  136. void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
  137. if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
  138. fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
  139. }
  140. }
  141. IOStatus NewSequentialFile(const std::string& fname,
  142. const FileOptions& options,
  143. std::unique_ptr<FSSequentialFile>* result,
  144. IODebugContext* /*dbg*/) override {
  145. result->reset();
  146. int fd = -1;
  147. int flags = cloexec_flags(O_RDONLY, &options);
  148. FILE* file = nullptr;
  149. if (options.use_direct_reads && !options.use_mmap_reads) {
  150. #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
  151. flags |= O_DIRECT;
  152. TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
  153. #endif
  154. }
  155. do {
  156. IOSTATS_TIMER_GUARD(open_nanos);
  157. fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
  158. } while (fd < 0 && errno == EINTR);
  159. if (fd < 0) {
  160. return IOError("While opening a file for sequentially reading", fname,
  161. errno);
  162. }
  163. SetFD_CLOEXEC(fd, &options);
  164. if (options.use_direct_reads && !options.use_mmap_reads) {
  165. #ifdef OS_MACOSX
  166. if (fcntl(fd, F_NOCACHE, 1) == -1) {
  167. close(fd);
  168. return IOError("While fcntl NoCache", fname, errno);
  169. }
  170. #endif
  171. } else {
  172. do {
  173. IOSTATS_TIMER_GUARD(open_nanos);
  174. file = fdopen(fd, "r");
  175. } while (file == nullptr && errno == EINTR);
  176. if (file == nullptr) {
  177. close(fd);
  178. return IOError("While opening file for sequentially read", fname,
  179. errno);
  180. }
  181. }
  182. result->reset(new PosixSequentialFile(
  183. fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
  184. options));
  185. return IOStatus::OK();
  186. }
  187. IOStatus NewRandomAccessFile(const std::string& fname,
  188. const FileOptions& options,
  189. std::unique_ptr<FSRandomAccessFile>* result,
  190. IODebugContext* /*dbg*/) override {
  191. result->reset();
  192. IOStatus s = IOStatus::OK();
  193. int fd;
  194. int flags = cloexec_flags(O_RDONLY, &options);
  195. if (options.use_direct_reads && !options.use_mmap_reads) {
  196. #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
  197. flags |= O_DIRECT;
  198. TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
  199. #endif
  200. }
  201. do {
  202. IOSTATS_TIMER_GUARD(open_nanos);
  203. fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
  204. } while (fd < 0 && errno == EINTR);
  205. if (fd < 0) {
  206. s = IOError("While open a file for random read", fname, errno);
  207. return s;
  208. }
  209. SetFD_CLOEXEC(fd, &options);
  210. if (options.use_mmap_reads) {
  211. // Use of mmap for random reads has been removed because it
  212. // kills performance when storage is fast.
  213. // Use mmap when virtual address-space is plentiful.
  214. uint64_t size;
  215. IOOptions opts;
  216. s = GetFileSizeOnOpenedFile(fd, fname, &size);
  217. if (s.ok()) {
  218. void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
  219. if (base != MAP_FAILED) {
  220. result->reset(
  221. new PosixMmapReadableFile(fd, fname, base, size, options));
  222. } else {
  223. s = IOError("while mmap file for read", fname, errno);
  224. close(fd);
  225. }
  226. } else {
  227. close(fd);
  228. }
  229. } else {
  230. if (options.use_direct_reads && !options.use_mmap_reads) {
  231. #ifdef OS_MACOSX
  232. if (fcntl(fd, F_NOCACHE, 1) == -1) {
  233. close(fd);
  234. return IOError("while fcntl NoCache", fname, errno);
  235. }
  236. #endif
  237. }
  238. result->reset(new PosixRandomAccessFile(
  239. fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
  240. options
  241. #if defined(ROCKSDB_IOURING_PRESENT)
  242. ,
  243. !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get()
  244. #endif
  245. ));
  246. }
  247. return s;
  248. }
  249. virtual IOStatus OpenWritableFile(const std::string& fname,
  250. const FileOptions& options, bool reopen,
  251. std::unique_ptr<FSWritableFile>* result,
  252. IODebugContext* /*dbg*/) {
  253. result->reset();
  254. IOStatus s;
  255. int fd = -1;
  256. int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
  257. // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
  258. if (options.use_direct_writes && !options.use_mmap_writes) {
  259. // Note: we should avoid O_APPEND here due to ta the following bug:
  260. // POSIX requires that opening a file with the O_APPEND flag should
  261. // have no affect on the location at which pwrite() writes data.
  262. // However, on Linux, if a file is opened with O_APPEND, pwrite()
  263. // appends data to the end of the file, regardless of the value of
  264. // offset.
  265. // More info here: https://linux.die.net/man/2/pwrite
  266. flags |= O_WRONLY;
  267. #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
  268. flags |= O_DIRECT;
  269. #endif
  270. TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
  271. } else if (options.use_mmap_writes) {
  272. // non-direct I/O
  273. flags |= O_RDWR;
  274. } else {
  275. flags |= O_WRONLY;
  276. }
  277. flags = cloexec_flags(flags, &options);
  278. do {
  279. IOSTATS_TIMER_GUARD(open_nanos);
  280. fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
  281. } while (fd < 0 && errno == EINTR);
  282. if (fd < 0) {
  283. s = IOError("While open a file for appending", fname, errno);
  284. return s;
  285. }
  286. SetFD_CLOEXEC(fd, &options);
  287. if (options.use_mmap_writes) {
  288. MaybeForceDisableMmap(fd);
  289. }
  290. uint64_t initial_file_size = 0;
  291. if (reopen) {
  292. s = GetFileSizeOnOpenedFile(fd, fname, &initial_file_size);
  293. if (!s.ok()) {
  294. close(fd);
  295. return s;
  296. }
  297. }
  298. if (options.use_mmap_writes && !forceMmapOff_) {
  299. result->reset(
  300. new PosixMmapFile(fname, fd, page_size_, options, initial_file_size));
  301. } else if (options.use_direct_writes && !options.use_mmap_writes) {
  302. #ifdef OS_MACOSX
  303. if (fcntl(fd, F_NOCACHE, 1) == -1) {
  304. close(fd);
  305. s = IOError("While fcntl NoCache an opened file for appending", fname,
  306. errno);
  307. return s;
  308. }
  309. #elif defined(OS_SOLARIS)
  310. if (directio(fd, DIRECTIO_ON) == -1) {
  311. if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
  312. close(fd);
  313. s = IOError("While calling directio()", fname, errno);
  314. return s;
  315. }
  316. }
  317. #endif
  318. result->reset(new PosixWritableFile(
  319. fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
  320. options, initial_file_size));
  321. } else {
  322. // disable mmap writes
  323. EnvOptions no_mmap_writes_options = options;
  324. no_mmap_writes_options.use_mmap_writes = false;
  325. result->reset(
  326. new PosixWritableFile(fname, fd,
  327. GetLogicalBlockSizeForWriteIfNeeded(
  328. no_mmap_writes_options, fname, fd),
  329. no_mmap_writes_options, initial_file_size));
  330. }
  331. return s;
  332. }
  333. IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
  334. std::unique_ptr<FSWritableFile>* result,
  335. IODebugContext* dbg) override {
  336. return OpenWritableFile(fname, options, false, result, dbg);
  337. }
  338. IOStatus ReopenWritableFile(const std::string& fname,
  339. const FileOptions& options,
  340. std::unique_ptr<FSWritableFile>* result,
  341. IODebugContext* dbg) override {
  342. return OpenWritableFile(fname, options, true, result, dbg);
  343. }
  344. IOStatus ReuseWritableFile(const std::string& fname,
  345. const std::string& old_fname,
  346. const FileOptions& options,
  347. std::unique_ptr<FSWritableFile>* result,
  348. IODebugContext* /*dbg*/) override {
  349. result->reset();
  350. IOStatus s;
  351. int fd = -1;
  352. int flags = 0;
  353. // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
  354. if (options.use_direct_writes && !options.use_mmap_writes) {
  355. flags |= O_WRONLY;
  356. #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
  357. flags |= O_DIRECT;
  358. #endif
  359. TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
  360. } else if (options.use_mmap_writes) {
  361. // mmap needs O_RDWR mode
  362. flags |= O_RDWR;
  363. } else {
  364. flags |= O_WRONLY;
  365. }
  366. flags = cloexec_flags(flags, &options);
  367. do {
  368. IOSTATS_TIMER_GUARD(open_nanos);
  369. fd = open(old_fname.c_str(), flags,
  370. GetDBFileMode(allow_non_owner_access_));
  371. } while (fd < 0 && errno == EINTR);
  372. if (fd < 0) {
  373. s = IOError("while reopen file for write", fname, errno);
  374. return s;
  375. }
  376. SetFD_CLOEXEC(fd, &options);
  377. // rename into place
  378. if (rename(old_fname.c_str(), fname.c_str()) != 0) {
  379. s = IOError("while rename file to " + fname, old_fname, errno);
  380. close(fd);
  381. return s;
  382. }
  383. if (options.use_mmap_writes) {
  384. MaybeForceDisableMmap(fd);
  385. }
  386. if (options.use_mmap_writes && !forceMmapOff_) {
  387. result->reset(new PosixMmapFile(fname, fd, page_size_, options,
  388. /*initial_file_size=*/0));
  389. } else if (options.use_direct_writes && !options.use_mmap_writes) {
  390. #ifdef OS_MACOSX
  391. if (fcntl(fd, F_NOCACHE, 1) == -1) {
  392. close(fd);
  393. s = IOError("while fcntl NoCache for reopened file for append", fname,
  394. errno);
  395. return s;
  396. }
  397. #elif defined(OS_SOLARIS)
  398. if (directio(fd, DIRECTIO_ON) == -1) {
  399. if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
  400. close(fd);
  401. s = IOError("while calling directio()", fname, errno);
  402. return s;
  403. }
  404. }
  405. #endif
  406. result->reset(new PosixWritableFile(
  407. fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
  408. options, /*initial_file_size=*/0));
  409. } else {
  410. // disable mmap writes
  411. FileOptions no_mmap_writes_options = options;
  412. no_mmap_writes_options.use_mmap_writes = false;
  413. result->reset(new PosixWritableFile(
  414. fname, fd,
  415. GetLogicalBlockSizeForWriteIfNeeded(no_mmap_writes_options, fname,
  416. fd),
  417. no_mmap_writes_options, /*initial_file_size=*/0));
  418. }
  419. return s;
  420. }
  421. IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
  422. std::unique_ptr<FSRandomRWFile>* result,
  423. IODebugContext* /*dbg*/) override {
  424. int fd = -1;
  425. int flags = cloexec_flags(O_RDWR, &options);
  426. while (fd < 0) {
  427. IOSTATS_TIMER_GUARD(open_nanos);
  428. fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
  429. if (fd < 0) {
  430. // Error while opening the file
  431. if (errno == EINTR) {
  432. continue;
  433. }
  434. return IOError("While open file for random read/write", fname, errno);
  435. }
  436. }
  437. SetFD_CLOEXEC(fd, &options);
  438. result->reset(new PosixRandomRWFile(fname, fd, options));
  439. return IOStatus::OK();
  440. }
  441. IOStatus NewMemoryMappedFileBuffer(
  442. const std::string& fname,
  443. std::unique_ptr<MemoryMappedFileBuffer>* result) override {
  444. int fd = -1;
  445. IOStatus status;
  446. int flags = cloexec_flags(O_RDWR, nullptr);
  447. while (fd < 0) {
  448. IOSTATS_TIMER_GUARD(open_nanos);
  449. fd = open(fname.c_str(), flags, 0644);
  450. if (fd < 0) {
  451. // Error while opening the file
  452. if (errno == EINTR) {
  453. continue;
  454. }
  455. status =
  456. IOError("While open file for raw mmap buffer access", fname, errno);
  457. break;
  458. }
  459. }
  460. uint64_t size;
  461. if (status.ok()) {
  462. IOOptions opts;
  463. status = GetFileSizeOnOpenedFile(fd, fname, &size);
  464. }
  465. void* base = nullptr;
  466. if (status.ok()) {
  467. base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
  468. MAP_SHARED, fd, 0);
  469. if (base == MAP_FAILED) {
  470. status = IOError("while mmap file for read", fname, errno);
  471. }
  472. }
  473. if (status.ok()) {
  474. result->reset(
  475. new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
  476. }
  477. if (fd >= 0) {
  478. // don't need to keep it open after mmap has been called
  479. close(fd);
  480. }
  481. return status;
  482. }
  483. IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
  484. std::unique_ptr<FSDirectory>* result,
  485. IODebugContext* /*dbg*/) override {
  486. result->reset();
  487. int fd;
  488. int flags = cloexec_flags(0, nullptr);
  489. {
  490. IOSTATS_TIMER_GUARD(open_nanos);
  491. fd = open(name.c_str(), flags);
  492. }
  493. if (fd < 0) {
  494. return IOError("While open directory", name, errno);
  495. } else {
  496. result->reset(new PosixDirectory(fd, name));
  497. }
  498. return IOStatus::OK();
  499. }
  500. IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
  501. IODebugContext* /*dbg*/) override {
  502. int result = access(fname.c_str(), F_OK);
  503. if (result == 0) {
  504. return IOStatus::OK();
  505. }
  506. int err = errno;
  507. switch (err) {
  508. case EACCES:
  509. case ELOOP:
  510. case ENAMETOOLONG:
  511. case ENOENT:
  512. case ENOTDIR:
  513. return IOStatus::NotFound();
  514. default:
  515. assert(err == EIO || err == ENOMEM);
  516. return IOStatus::IOError("Unexpected error(" + std::to_string(err) +
  517. ") accessing file `" + fname + "' ");
  518. }
  519. }
  520. IOStatus GetChildren(const std::string& dir, const IOOptions& opts,
  521. std::vector<std::string>* result,
  522. IODebugContext* /*dbg*/) override {
  523. result->clear();
  524. DIR* d = opendir(dir.c_str());
  525. if (d == nullptr) {
  526. switch (errno) {
  527. case EACCES:
  528. case ENOENT:
  529. case ENOTDIR:
  530. return IOStatus::NotFound();
  531. default:
  532. return IOError("While opendir", dir, errno);
  533. }
  534. }
  535. // reset errno before calling readdir()
  536. errno = 0;
  537. struct dirent* entry;
  538. while ((entry = readdir(d)) != nullptr) {
  539. // filter out '.' and '..' directory entries
  540. // which appear only on some platforms
  541. const bool ignore =
  542. entry->d_type == DT_DIR &&
  543. (strcmp(entry->d_name, ".") == 0 ||
  544. strcmp(entry->d_name, "..") == 0
  545. #ifndef ASSERT_STATUS_CHECKED
  546. // In case of ASSERT_STATUS_CHECKED, GetChildren support older
  547. // version of API for debugging purpose.
  548. || opts.do_not_recurse
  549. #endif
  550. );
  551. if (!ignore) {
  552. result->push_back(entry->d_name);
  553. }
  554. errno = 0; // reset errno if readdir() success
  555. }
  556. // always attempt to close the dir
  557. const auto pre_close_errno = errno; // errno may be modified by closedir
  558. const int close_result = closedir(d);
  559. if (pre_close_errno != 0) {
  560. // error occurred during readdir
  561. return IOError("While readdir", dir, pre_close_errno);
  562. }
  563. if (close_result != 0) {
  564. // error occurred during closedir
  565. return IOError("While closedir", dir, errno);
  566. }
  567. return IOStatus::OK();
  568. }
  569. IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
  570. IODebugContext* /*dbg*/) override {
  571. IOStatus result;
  572. if (unlink(fname.c_str()) != 0) {
  573. result = IOError("while unlink() file", fname, errno);
  574. }
  575. return result;
  576. }
  577. IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
  578. IODebugContext* /*dbg*/) override {
  579. if (mkdir(name.c_str(), 0755) != 0) {
  580. return IOError("While mkdir", name, errno);
  581. }
  582. return IOStatus::OK();
  583. }
  584. IOStatus CreateDirIfMissing(const std::string& name,
  585. const IOOptions& /*opts*/,
  586. IODebugContext* /*dbg*/) override {
  587. if (mkdir(name.c_str(), 0755) != 0) {
  588. if (errno != EEXIST) {
  589. return IOError("While mkdir if missing", name, errno);
  590. } else if (!DirExists(name)) { // Check that name is actually a
  591. // directory.
  592. // Message is taken from mkdir
  593. return IOStatus::IOError("`" + name +
  594. "' exists but is not a directory");
  595. }
  596. }
  597. return IOStatus::OK();
  598. }
  599. IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
  600. IODebugContext* /*dbg*/) override {
  601. if (rmdir(name.c_str()) != 0) {
  602. return IOError("file rmdir", name, errno);
  603. }
  604. return IOStatus::OK();
  605. }
  606. IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
  607. uint64_t* size, IODebugContext* /*dbg*/) override {
  608. struct stat sbuf {};
  609. if (stat(fname.c_str(), &sbuf) != 0) {
  610. *size = 0;
  611. return IOError("while stat a file for size", fname, errno);
  612. } else {
  613. *size = sbuf.st_size;
  614. }
  615. return IOStatus::OK();
  616. }
  617. IOStatus GetFileModificationTime(const std::string& fname,
  618. const IOOptions& /*opts*/,
  619. uint64_t* file_mtime,
  620. IODebugContext* /*dbg*/) override {
  621. struct stat s;
  622. if (stat(fname.c_str(), &s) != 0) {
  623. return IOError("while stat a file for modification time", fname, errno);
  624. }
  625. *file_mtime = static_cast<uint64_t>(s.st_mtime);
  626. return IOStatus::OK();
  627. }
  628. IOStatus RenameFile(const std::string& src, const std::string& target,
  629. const IOOptions& /*opts*/,
  630. IODebugContext* /*dbg*/) override {
  631. if (rename(src.c_str(), target.c_str()) != 0) {
  632. return IOError("While renaming a file to " + target, src, errno);
  633. }
  634. return IOStatus::OK();
  635. }
  636. IOStatus LinkFile(const std::string& src, const std::string& target,
  637. const IOOptions& /*opts*/,
  638. IODebugContext* /*dbg*/) override {
  639. if (link(src.c_str(), target.c_str()) != 0) {
  640. if (errno == EXDEV || errno == ENOTSUP) {
  641. return IOStatus::NotSupported(errno == EXDEV
  642. ? "No cross FS links allowed"
  643. : "Links not supported by FS");
  644. }
  645. return IOError("while link file to " + target, src, errno);
  646. }
  647. return IOStatus::OK();
  648. }
  649. IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
  650. uint64_t* count, IODebugContext* /*dbg*/) override {
  651. struct stat s;
  652. if (stat(fname.c_str(), &s) != 0) {
  653. return IOError("while stat a file for num file links", fname, errno);
  654. }
  655. *count = static_cast<uint64_t>(s.st_nlink);
  656. return IOStatus::OK();
  657. }
  658. IOStatus AreFilesSame(const std::string& first, const std::string& second,
  659. const IOOptions& /*opts*/, bool* res,
  660. IODebugContext* /*dbg*/) override {
  661. struct stat statbuf[2];
  662. if (stat(first.c_str(), &statbuf[0]) != 0) {
  663. return IOError("stat file", first, errno);
  664. }
  665. if (stat(second.c_str(), &statbuf[1]) != 0) {
  666. return IOError("stat file", second, errno);
  667. }
  668. if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
  669. minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
  670. statbuf[0].st_ino != statbuf[1].st_ino) {
  671. *res = false;
  672. } else {
  673. *res = true;
  674. }
  675. return IOStatus::OK();
  676. }
  677. IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
  678. FileLock** lock, IODebugContext* /*dbg*/) override {
  679. *lock = nullptr;
  680. LockHoldingInfo lhi;
  681. int64_t current_time = 0;
  682. // Ignore status code as the time is only used for error message.
  683. SystemClock::Default()
  684. ->GetCurrentTime(&current_time)
  685. .PermitUncheckedError();
  686. lhi.acquire_time = current_time;
  687. lhi.acquiring_thread = Env::Default()->GetThreadID();
  688. mutex_locked_files.Lock();
  689. // If it already exists in the locked_files set, then it is already locked,
  690. // and fail this lock attempt. Otherwise, insert it into locked_files.
  691. // This check is needed because fcntl() does not detect lock conflict
  692. // if the fcntl is issued by the same thread that earlier acquired
  693. // this lock.
  694. // We must do this check *before* opening the file:
  695. // Otherwise, we will open a new file descriptor. Locks are associated with
  696. // a process, not a file descriptor and when *any* file descriptor is
  697. // closed, all locks the process holds for that *file* are released
  698. const auto it_success = locked_files.insert({fname, lhi});
  699. if (it_success.second == false) {
  700. LockHoldingInfo prev_info = it_success.first->second;
  701. mutex_locked_files.Unlock();
  702. errno = ENOLCK;
  703. // Note that the thread ID printed is the same one as the one in
  704. // posix logger, but posix logger prints it hex format.
  705. return IOError("lock hold by current process, acquire time " +
  706. std::to_string(prev_info.acquire_time) +
  707. " acquiring thread " +
  708. std::to_string(prev_info.acquiring_thread),
  709. fname, errno);
  710. }
  711. IOStatus result = IOStatus::OK();
  712. int fd;
  713. int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
  714. {
  715. IOSTATS_TIMER_GUARD(open_nanos);
  716. fd = open(fname.c_str(), flags, 0644);
  717. }
  718. if (fd < 0) {
  719. result = IOError("while open a file for lock", fname, errno);
  720. } else if (LockOrUnlock(fd, true) == -1) {
  721. result = IOError("While lock file", fname, errno);
  722. close(fd);
  723. } else {
  724. SetFD_CLOEXEC(fd, nullptr);
  725. PosixFileLock* my_lock = new PosixFileLock;
  726. my_lock->fd_ = fd;
  727. my_lock->filename = fname;
  728. *lock = my_lock;
  729. }
  730. if (!result.ok()) {
  731. // If there is an error in locking, then remove the pathname from
  732. // locked_files. (If we got this far, it did not exist in locked_files
  733. // before this call.)
  734. locked_files.erase(fname);
  735. }
  736. mutex_locked_files.Unlock();
  737. return result;
  738. }
  739. IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
  740. IODebugContext* /*dbg*/) override {
  741. PosixFileLock* my_lock = static_cast<PosixFileLock*>(lock);
  742. IOStatus result;
  743. mutex_locked_files.Lock();
  744. // If we are unlocking, then verify that we had locked it earlier,
  745. // it should already exist in locked_files. Remove it from locked_files.
  746. if (locked_files.erase(my_lock->filename) != 1) {
  747. errno = ENOLCK;
  748. result = IOError("unlock", my_lock->filename, errno);
  749. } else if (LockOrUnlock(my_lock->fd_, false) == -1) {
  750. result = IOError("unlock", my_lock->filename, errno);
  751. }
  752. close(my_lock->fd_);
  753. my_lock->Clear();
  754. delete my_lock;
  755. mutex_locked_files.Unlock();
  756. return result;
  757. }
  758. IOStatus GetAbsolutePath(const std::string& db_path,
  759. const IOOptions& /*opts*/, std::string* output_path,
  760. IODebugContext* /*dbg*/) override {
  761. if (!db_path.empty() && db_path[0] == '/') {
  762. *output_path = db_path;
  763. return IOStatus::OK();
  764. }
  765. char the_path[4096];
  766. char* ret = getcwd(the_path, 4096);
  767. if (ret == nullptr) {
  768. return IOStatus::IOError(errnoStr(errno).c_str());
  769. }
  770. *output_path = ret;
  771. return IOStatus::OK();
  772. }
  773. IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result,
  774. IODebugContext* /*dbg*/) override {
  775. const char* env = getenv("TEST_TMPDIR");
  776. if (env && env[0] != '\0') {
  777. *result = env;
  778. } else {
  779. char buf[100];
  780. snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
  781. *result = buf;
  782. }
  783. // Directory may already exist
  784. {
  785. IOOptions opts;
  786. return CreateDirIfMissing(*result, opts, nullptr);
  787. }
  788. }
  789. IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
  790. uint64_t* free_space,
  791. IODebugContext* /*dbg*/) override {
  792. struct statvfs sbuf;
  793. if (statvfs(fname.c_str(), &sbuf) < 0) {
  794. return IOError("While doing statvfs", fname, errno);
  795. }
  796. // sbuf.bfree is total free space available to root
  797. // sbuf.bavail is total free space available to unprivileged user
  798. // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id
  799. if (geteuid()) {
  800. // non-zero user is unprivileged, or -1 if error. take more conservative
  801. // size
  802. *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail);
  803. } else {
  804. // root user can access all disk space
  805. *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree);
  806. }
  807. return IOStatus::OK();
  808. }
  809. IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/,
  810. bool* is_dir, IODebugContext* /*dbg*/) override {
  811. // First open
  812. int fd = -1;
  813. int flags = cloexec_flags(O_RDONLY, nullptr);
  814. {
  815. IOSTATS_TIMER_GUARD(open_nanos);
  816. fd = open(path.c_str(), flags);
  817. }
  818. if (fd < 0) {
  819. return IOError("While open for IsDirectory()", path, errno);
  820. }
  821. IOStatus io_s;
  822. struct stat sbuf;
  823. if (fstat(fd, &sbuf) < 0) {
  824. io_s = IOError("While doing stat for IsDirectory()", path, errno);
  825. }
  826. close(fd);
  827. if (io_s.ok() && nullptr != is_dir) {
  828. *is_dir = S_ISDIR(sbuf.st_mode);
  829. }
  830. return io_s;
  831. }
  832. FileOptions OptimizeForLogWrite(const FileOptions& file_options,
  833. const DBOptions& db_options) const override {
  834. FileOptions optimized = file_options;
  835. optimized.use_mmap_writes = false;
  836. optimized.use_direct_writes = false;
  837. optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
  838. // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
  839. // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
  840. // test and make this false
  841. optimized.fallocate_with_keep_size = true;
  842. optimized.writable_file_max_buffer_size =
  843. db_options.writable_file_max_buffer_size;
  844. return optimized;
  845. }
  846. FileOptions OptimizeForManifestWrite(
  847. const FileOptions& file_options) const override {
  848. FileOptions optimized = file_options;
  849. optimized.use_mmap_writes = false;
  850. optimized.use_direct_writes = false;
  851. optimized.fallocate_with_keep_size = true;
  852. return optimized;
  853. }
  854. FileOptions OptimizeForCompactionTableRead(
  855. const FileOptions& file_options,
  856. const ImmutableDBOptions& db_options) const override {
  857. FileOptions fo = FileOptions(file_options);
  858. #ifdef OS_LINUX
  859. // To fix https://github.com/facebook/rocksdb/issues/12038
  860. if (!file_options.use_direct_reads &&
  861. file_options.compaction_readahead_size > 0) {
  862. size_t system_limit =
  863. GetCompactionReadaheadSizeSystemLimit(db_options.db_paths);
  864. if (system_limit > 0 &&
  865. file_options.compaction_readahead_size > system_limit) {
  866. fo.compaction_readahead_size = system_limit;
  867. }
  868. }
  869. #else
  870. (void)db_options;
  871. #endif
  872. return fo;
  873. }
  874. #ifdef OS_LINUX
  875. Status RegisterDbPaths(const std::vector<std::string>& paths) override {
  876. return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
  877. }
  878. Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
  879. logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths);
  880. return Status::OK();
  881. }
  882. #endif
  883. private:
  884. bool forceMmapOff_ = false; // do we override Env options?
  885. // This is a faster API comparing to the public method that uses stat to get
  886. // file size. However this API only works on opened file.
  887. IOStatus GetFileSizeOnOpenedFile(const int fd, const std::string& name,
  888. uint64_t* size) {
  889. struct stat sb {};
  890. *size = 0;
  891. // Get file information using fstat
  892. if (fstat(fd, &sb) == -1) {
  893. return IOError(
  894. "while fstat a file for size with fd " + std::to_string(fd), name,
  895. errno);
  896. }
  897. *size = sb.st_size;
  898. return IOStatus::OK();
  899. }
  900. #ifdef OS_LINUX
  901. // Get the minimum "linux system limit" (i.e, the largest I/O size that the OS
  902. // can issue to block devices under a directory, also known as
  903. // "max_sectors_kb" ) among `db_paths`.
  904. // Return 0 if no limit can be found or there is an error in
  905. // retrieving such limit.
  906. static size_t GetCompactionReadaheadSizeSystemLimit(
  907. const std::vector<DbPath>& db_paths) {
  908. Status s;
  909. size_t limit_kb = 0;
  910. for (const auto& db_path : db_paths) {
  911. size_t dir_max_sectors_kb = 0;
  912. s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path,
  913. &dir_max_sectors_kb);
  914. if (!s.ok()) {
  915. break;
  916. }
  917. limit_kb = (limit_kb == 0) ? dir_max_sectors_kb
  918. : std::min(limit_kb, dir_max_sectors_kb);
  919. }
  920. if (s.ok()) {
  921. return limit_kb * 1024;
  922. } else {
  923. return 0;
  924. }
  925. }
  926. #endif
  927. // Returns true iff the named directory exists and is a directory.
  928. virtual bool DirExists(const std::string& dname) {
  929. struct stat statbuf;
  930. if (stat(dname.c_str(), &statbuf) == 0) {
  931. return S_ISDIR(statbuf.st_mode);
  932. }
  933. return false; // stat() failed return false
  934. }
  // Returns true when the file system backing `fd` is one of ext4, XFS or
  // tmpfs, as identified by the f_type reported by fstatfs(). Returns false
  // if fstatfs() fails, for any other file system type, and always when
  // ROCKSDB_FALLOCATE_PRESENT is not defined.
  bool SupportsFastAllocate(int fd) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
    struct statfs fs_info;
    if (fstatfs(fd, &fs_info) != 0) {
      return false;
    }
    switch (fs_info.f_type) {
      case EXT4_SUPER_MAGIC:
      case XFS_SUPER_MAGIC:
      case TMPFS_MAGIC:
        return true;
      default:
        return false;
    }
#else
    (void)fd;
    return false;
#endif
  }
  956. void MaybeForceDisableMmap(int fd) {
  957. static std::once_flag s_check_disk_for_mmap_once;
  958. assert(this == FileSystem::Default().get());
  959. std::call_once(
  960. s_check_disk_for_mmap_once,
  961. [this](int fdesc) {
  962. // this will be executed once in the program's lifetime.
  963. // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
  964. if (!SupportsFastAllocate(fdesc)) {
  965. forceMmapOff_ = true;
  966. }
  967. },
  968. fd);
  969. }
  970. #ifdef ROCKSDB_IOURING_PRESENT
  971. bool IsIOUringEnabled() {
  972. if (RocksDbIOUringEnable && RocksDbIOUringEnable()) {
  973. return true;
  974. } else {
  975. return false;
  976. }
  977. }
  978. #endif // ROCKSDB_IOURING_PRESENT
  979. // TODO:
  980. // 1. Update Poll API to take into account min_completions
  981. // and returns if number of handles in io_handles (any order) completed is
  982. // equal to atleast min_completions.
  983. // 2. Currently in case of direct_io, Read API is called because of which call
  984. // to Poll API fails as it expects IOHandle to be populated.
  985. IOStatus Poll(std::vector<void*>& io_handles,
  986. size_t /*min_completions*/) override {
  987. #if defined(ROCKSDB_IOURING_PRESENT)
  988. // io_uring_queue_init.
  989. struct io_uring* iu = nullptr;
  990. if (thread_local_io_urings_) {
  991. iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
  992. }
  993. // Init failed, platform doesn't support io_uring.
  994. if (iu == nullptr) {
  995. return IOStatus::NotSupported("Poll");
  996. }
  997. for (size_t i = 0; i < io_handles.size(); i++) {
  998. // The request has been completed in earlier runs.
  999. if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
  1000. continue;
  1001. }
  1002. // Loop until IO for io_handles[i] is completed.
  1003. while (true) {
  1004. // io_uring_wait_cqe.
  1005. struct io_uring_cqe* cqe = nullptr;
  1006. ssize_t ret = io_uring_wait_cqe(iu, &cqe);
  1007. if (ret) {
  1008. fprintf(stderr, "Poll: io_uring_wait_cqe failed: %ld", (long)ret);
  1009. if (ret == -EINTR || ret == -EAGAIN) {
  1010. continue; // Retry
  1011. }
  1012. abort();
  1013. }
  1014. // Step 3: Populate the request.
  1015. assert(cqe != nullptr);
  1016. Posix_IOHandle* posix_handle =
  1017. static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
  1018. assert(posix_handle->iu == iu);
  1019. if (posix_handle->iu != iu) {
  1020. return IOStatus::IOError("");
  1021. }
  1022. // Reset cqe data to catch any stray reuse of it
  1023. static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
  1024. FSReadRequest req;
  1025. req.scratch = posix_handle->scratch;
  1026. req.offset = posix_handle->offset;
  1027. req.len = posix_handle->len;
  1028. size_t finished_len = 0;
  1029. size_t bytes_read = 0;
  1030. bool read_again = false;
  1031. UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
  1032. true /*async_read*/, posix_handle->use_direct_io,
  1033. posix_handle->alignment, finished_len, &req, bytes_read,
  1034. read_again);
  1035. posix_handle->is_finished = true;
  1036. io_uring_cqe_seen(iu, cqe);
  1037. posix_handle->cb(req, posix_handle->cb_arg);
  1038. (void)finished_len;
  1039. (void)bytes_read;
  1040. (void)read_again;
  1041. if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
  1042. break;
  1043. }
  1044. }
  1045. }
  1046. return IOStatus::OK();
  1047. #else
  1048. (void)io_handles;
  1049. return IOStatus::NotSupported("Poll not implemented");
  1050. #endif
  1051. }
  1052. IOStatus AbortIO(std::vector<void*>& io_handles) override {
  1053. #if defined(ROCKSDB_IOURING_PRESENT)
  1054. // io_uring_queue_init.
  1055. struct io_uring* iu = nullptr;
  1056. if (thread_local_io_urings_) {
  1057. iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
  1058. }
  1059. // Init failed, platform doesn't support io_uring.
  1060. // If Poll is not supported then it didn't submit any request and it should
  1061. // return OK.
  1062. if (iu == nullptr) {
  1063. return IOStatus::OK();
  1064. }
  1065. for (size_t i = 0; i < io_handles.size(); i++) {
  1066. Posix_IOHandle* posix_handle =
  1067. static_cast<Posix_IOHandle*>(io_handles[i]);
  1068. if (posix_handle->is_finished == true) {
  1069. continue;
  1070. }
  1071. assert(posix_handle->iu == iu);
  1072. if (posix_handle->iu != iu) {
  1073. return IOStatus::IOError("");
  1074. }
  1075. // Prepare the cancel request.
  1076. struct io_uring_sqe* sqe;
  1077. sqe = io_uring_get_sqe(iu);
  1078. // In order to cancel the request, sqe->addr of cancel request should
  1079. // match with the read request submitted which is posix_handle->iov.
  1080. io_uring_prep_cancel(sqe, &posix_handle->iov, 0);
  1081. // Sets sqe->user_data to posix_handle.
  1082. io_uring_sqe_set_data(sqe, posix_handle);
  1083. // submit the request.
  1084. ssize_t ret = io_uring_submit(iu);
  1085. if (ret < 0) {
  1086. fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
  1087. return IOStatus::IOError("io_uring_submit() requested but returned " +
  1088. std::to_string(ret));
  1089. }
  1090. }
  1091. // After submitting the requests, wait for the requests.
  1092. for (size_t i = 0; i < io_handles.size(); i++) {
  1093. if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
  1094. continue;
  1095. }
  1096. while (true) {
  1097. struct io_uring_cqe* cqe = nullptr;
  1098. ssize_t ret = io_uring_wait_cqe(iu, &cqe);
  1099. if (ret) {
  1100. fprintf(stderr, "AbortIO: io_uring_wait_cqe failed: %ld", (long)ret);
  1101. if (ret == -EINTR || ret == -EAGAIN) {
  1102. continue; // Retry
  1103. }
  1104. abort();
  1105. }
  1106. assert(cqe != nullptr);
  1107. // Returns cqe->user_data.
  1108. Posix_IOHandle* posix_handle =
  1109. static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
  1110. assert(posix_handle->iu == iu);
  1111. if (posix_handle->iu != iu) {
  1112. return IOStatus::IOError("");
  1113. }
  1114. posix_handle->req_count++;
  1115. // Reset cqe data to catch any stray reuse of it
  1116. static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
  1117. io_uring_cqe_seen(iu, cqe);
  1118. // - If the request is cancelled successfully, the original request is
  1119. // completed with -ECANCELED and the cancel request is completed with
  1120. // a result of 0.
  1121. // - If the request was already running, the original may or
  1122. // may not complete in error. The cancel request will complete with
  1123. // -EALREADY for that case.
  1124. // - And finally, if the request to cancel wasn't
  1125. // found, the cancel request is completed with -ENOENT.
  1126. //
  1127. // Every handle has to wait for 2 requests completion: original one and
  1128. // the cancel request which is tracked by PosixHandle::req_count.
  1129. if (posix_handle->req_count == 2 &&
  1130. static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
  1131. posix_handle->is_finished = true;
  1132. FSReadRequest req;
  1133. req.status = IOStatus::Aborted();
  1134. posix_handle->cb(req, posix_handle->cb_arg);
  1135. break;
  1136. }
  1137. }
  1138. }
  1139. return IOStatus::OK();
  1140. #else
  1141. // If Poll is not supported then it didn't submit any request and it should
  1142. // return OK.
  1143. (void)io_handles;
  1144. return IOStatus::OK();
  1145. #endif
  1146. }
  1147. void SupportedOps(int64_t& supported_ops) override {
  1148. supported_ops = 0;
  1149. #if defined(ROCKSDB_IOURING_PRESENT)
  1150. if (IsIOUringEnabled()) {
  1151. // Underlying FS supports async_io
  1152. supported_ops |= (1 << FSSupportedOps::kAsyncIO);
  1153. }
  1154. #endif
  1155. supported_ops |= (1 << FSSupportedOps::kFSPrefetch);
  1156. }
  1157. #if defined(ROCKSDB_IOURING_PRESENT)
  1158. // io_uring instance
  1159. std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
  1160. #endif
  1161. size_t page_size_;
  1162. // If true, allow non owner read access for db files. Otherwise, non-owner
  1163. // has no access to db files.
  1164. bool allow_non_owner_access_;
  1165. #ifdef OS_LINUX
  1166. static LogicalBlockSizeCache logical_block_size_cache_;
  1167. #endif
  1168. static size_t GetLogicalBlockSize(const std::string& fname, int fd);
  1169. // In non-direct IO mode, this directly returns kDefaultPageSize.
  1170. // Otherwise call GetLogicalBlockSize.
  1171. static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options,
  1172. const std::string& fname,
  1173. int fd);
  1174. static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options,
  1175. const std::string& fname,
  1176. int fd);
  1177. };
  1178. #ifdef OS_LINUX
  1179. LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_;
  1180. #endif
  1181. size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) {
  1182. #ifdef OS_LINUX
  1183. return logical_block_size_cache_.GetLogicalBlockSize(fname, fd);
  1184. #else
  1185. (void)fname;
  1186. return PosixHelper::GetLogicalBlockSizeOfFd(fd);
  1187. #endif
  1188. }
  1189. size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded(
  1190. const EnvOptions& options, const std::string& fname, int fd) {
  1191. return options.use_direct_reads
  1192. ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
  1193. : kDefaultPageSize;
  1194. }
  1195. size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
  1196. const EnvOptions& options, const std::string& fname, int fd) {
  1197. return options.use_direct_writes
  1198. ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
  1199. : kDefaultPageSize;
  1200. }
  1201. PosixFileSystem::PosixFileSystem()
  1202. : forceMmapOff_(false),
  1203. page_size_(getpagesize()),
  1204. allow_non_owner_access_(true) {
  1205. #if defined(ROCKSDB_IOURING_PRESENT)
  1206. // Test whether IOUring is supported, and if it does, create a managing
  1207. // object for thread local point so that in the future thread-local
  1208. // io_uring can be created.
  1209. struct io_uring* new_io_uring = CreateIOUring();
  1210. if (new_io_uring != nullptr) {
  1211. thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
  1212. delete new_io_uring;
  1213. }
  1214. #endif
  1215. }
  1216. } // namespace
  1217. //
  1218. // Default Posix FileSystem
  1219. //
  // Returns the default FileSystem: a shared_ptr to a PosixFileSystem.
  std::shared_ptr<FileSystem> FileSystem::Default() {
    // `instance` is declared through STATIC_AVOID_DESTRUCTION and initialized
    // from the make_shared expression on the following line.
    // NOTE(review): presumably the macro creates a static that is never
    // destroyed, so every call returns the same object -- confirm against the
    // macro definition.
    STATIC_AVOID_DESTRUCTION(std::shared_ptr<FileSystem>, instance)
    (std::make_shared<PosixFileSystem>());
    return instance;
  }
  1225. static FactoryFunc<FileSystem> posix_filesystem_reg =
  1226. ObjectLibrary::Default()->AddFactory<FileSystem>(
  1227. ObjectLibrary::PatternEntry("posix").AddSeparator("://", false),
  1228. [](const std::string& /* uri */, std::unique_ptr<FileSystem>* f,
  1229. std::string* /* errmsg */) {
  1230. f->reset(new PosixFileSystem());
  1231. return f->get();
  1232. });
  1233. } // namespace ROCKSDB_NAMESPACE
  1234. #endif