io_posix.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <errno.h>
  11. #if defined(ROCKSDB_IOURING_PRESENT)
  12. #include <liburing.h>
  13. #include <sys/uio.h>
  14. #endif
  15. #include <unistd.h>
  16. #include <atomic>
  17. #include <string>
  18. #include "rocksdb/env.h"
  19. #include "util/thread_local.h"
  20. #include "rocksdb/file_system.h"
  21. #include "rocksdb/io_status.h"
  // On platforms other than Linux, Cygwin and AIX, the posix_fadvise() advice
  // constants below are defined only as placeholders so that code referring
  // to them still compiles.
#if !defined(OS_LINUX) && !defined(CYGWIN) && !defined(OS_AIX)
#define POSIX_FADV_NORMAL 0     /* [MC1] no further special treatment */
#define POSIX_FADV_RANDOM 1     /* [MC1] expect random page refs */
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
#define POSIX_FADV_WILLNEED 3   /* [MC1] will need these pages */
#define POSIX_FADV_DONTNEED 4   /* [MC1] do not need these pages */
#endif
  31. namespace ROCKSDB_NAMESPACE {
  // Builds the text used for an I/O error message.
  //
  // Returns `context` unchanged when `file_name` is empty; otherwise returns
  // "<context>: <file_name>".
  static std::string IOErrorMsg(const std::string& context,
                                const std::string& file_name) {
    std::string message = context;
    if (!file_name.empty()) {
      message += ": ";
      message += file_name;
    }
    return message;
  }
  39. // file_name can be left empty if it is not unkown.
  40. static IOStatus IOError(const std::string& context,
  41. const std::string& file_name, int err_number) {
  42. switch (err_number) {
  43. case ENOSPC: {
  44. IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
  45. strerror(err_number));
  46. s.SetRetryable(true);
  47. return s;
  48. }
  49. case ESTALE:
  50. return IOStatus::IOError(IOStatus::kStaleFile);
  51. case ENOENT:
  52. return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
  53. strerror(err_number));
  54. default:
  55. return IOStatus::IOError(IOErrorMsg(context, file_name),
  56. strerror(err_number));
  57. }
  58. }
  // Collection of static helpers shared by the POSIX file classes.
  class PosixHelper {
   public:
    // Defined out of line. Presumably writes an identifier for the file behind
    // `fd` into `id` (at most `max_size` bytes) and returns the number of bytes
    // written -- confirm against the definition.
    static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
  };
  63. class PosixSequentialFile : public FSSequentialFile {
  64. private:
  65. std::string filename_;
  66. FILE* file_;
  67. int fd_;
  68. bool use_direct_io_;
  69. size_t logical_sector_size_;
  70. public:
  71. PosixSequentialFile(const std::string& fname, FILE* file, int fd,
  72. const EnvOptions& options);
  73. virtual ~PosixSequentialFile();
  74. virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result,
  75. char* scratch, IODebugContext* dbg) override;
  76. virtual IOStatus PositionedRead(uint64_t offset, size_t n,
  77. const IOOptions& opts, Slice* result,
  78. char* scratch, IODebugContext* dbg) override;
  79. virtual IOStatus Skip(uint64_t n) override;
  80. virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  81. virtual bool use_direct_io() const override { return use_direct_io_; }
  82. virtual size_t GetRequiredBufferAlignment() const override {
  83. return logical_sector_size_;
  84. }
  85. };
  86. #if defined(ROCKSDB_IOURING_PRESENT)
  // Queue depth handed to io_uring_queue_init() for every io_uring instance
  // created by CreateIOUring().
  const unsigned int kIoUringDepth = 256U;
  89. inline void DeleteIOUring(void* p) {
  90. struct io_uring* iu = static_cast<struct io_uring*>(p);
  91. delete iu;
  92. }
  93. inline struct io_uring* CreateIOUring() {
  94. struct io_uring* new_io_uring = new struct io_uring;
  95. int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
  96. if (ret) {
  97. delete new_io_uring;
  98. new_io_uring = nullptr;
  99. }
  100. return new_io_uring;
  101. }
  102. #endif // defined(ROCKSDB_IOURING_PRESENT)
  103. class PosixRandomAccessFile : public FSRandomAccessFile {
  104. protected:
  105. std::string filename_;
  106. int fd_;
  107. bool use_direct_io_;
  108. size_t logical_sector_size_;
  109. #if defined(ROCKSDB_IOURING_PRESENT)
  110. ThreadLocalPtr* thread_local_io_urings_;
  111. #endif
  112. public:
  113. PosixRandomAccessFile(const std::string& fname, int fd,
  114. const EnvOptions& options
  115. #if defined(ROCKSDB_IOURING_PRESENT)
  116. ,
  117. ThreadLocalPtr* thread_local_io_urings
  118. #endif
  119. );
  120. virtual ~PosixRandomAccessFile();
  121. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
  122. Slice* result, char* scratch,
  123. IODebugContext* dbg) const override;
  124. virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
  125. const IOOptions& options,
  126. IODebugContext* dbg) override;
  127. virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts,
  128. IODebugContext* dbg) override;
  129. #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
  130. virtual size_t GetUniqueId(char* id, size_t max_size) const override;
  131. #endif
  132. virtual void Hint(AccessPattern pattern) override;
  133. virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  134. virtual bool use_direct_io() const override { return use_direct_io_; }
  135. virtual size_t GetRequiredBufferAlignment() const override {
  136. return logical_sector_size_;
  137. }
  138. };
  139. class PosixWritableFile : public FSWritableFile {
  140. protected:
  141. const std::string filename_;
  142. const bool use_direct_io_;
  143. int fd_;
  144. uint64_t filesize_;
  145. size_t logical_sector_size_;
  146. #ifdef ROCKSDB_FALLOCATE_PRESENT
  147. bool allow_fallocate_;
  148. bool fallocate_with_keep_size_;
  149. #endif
  150. #ifdef ROCKSDB_RANGESYNC_PRESENT
  151. // Even if the syscall is present, the filesystem may still not properly
  152. // support it, so we need to do a dynamic check too.
  153. bool sync_file_range_supported_;
  154. #endif // ROCKSDB_RANGESYNC_PRESENT
  155. public:
  156. explicit PosixWritableFile(const std::string& fname, int fd,
  157. const EnvOptions& options);
  158. virtual ~PosixWritableFile();
  159. // Need to implement this so the file is truncated correctly
  160. // with direct I/O
  161. virtual IOStatus Truncate(uint64_t size, const IOOptions& opts,
  162. IODebugContext* dbg) override;
  163. virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
  164. virtual IOStatus Append(const Slice& data, const IOOptions& opts,
  165. IODebugContext* dbg) override;
  166. virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
  167. const IOOptions& opts,
  168. IODebugContext* dbg) override;
  169. virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
  170. virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
  171. virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  172. virtual bool IsSyncThreadSafe() const override;
  173. virtual bool use_direct_io() const override { return use_direct_io_; }
  174. virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
  175. virtual uint64_t GetFileSize(const IOOptions& opts,
  176. IODebugContext* dbg) override;
  177. virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  178. virtual size_t GetRequiredBufferAlignment() const override {
  179. return logical_sector_size_;
  180. }
  181. #ifdef ROCKSDB_FALLOCATE_PRESENT
  182. virtual IOStatus Allocate(uint64_t offset, uint64_t len,
  183. const IOOptions& opts,
  184. IODebugContext* dbg) override;
  185. #endif
  186. virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
  187. const IOOptions& opts,
  188. IODebugContext* dbg) override;
  189. #ifdef OS_LINUX
  190. virtual size_t GetUniqueId(char* id, size_t max_size) const override;
  191. #endif
  192. };
  193. // mmap() based random-access
  194. class PosixMmapReadableFile : public FSRandomAccessFile {
  195. private:
  196. int fd_;
  197. std::string filename_;
  198. void* mmapped_region_;
  199. size_t length_;
  200. public:
  201. PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
  202. size_t length, const EnvOptions& options);
  203. virtual ~PosixMmapReadableFile();
  204. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
  205. Slice* result, char* scratch,
  206. IODebugContext* dbg) const override;
  207. virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  208. };
  209. class PosixMmapFile : public FSWritableFile {
  210. private:
  211. std::string filename_;
  212. int fd_;
  213. size_t page_size_;
  214. size_t map_size_; // How much extra memory to map at a time
  215. char* base_; // The mapped region
  216. char* limit_; // Limit of the mapped region
  217. char* dst_; // Where to write next (in range [base_,limit_])
  218. char* last_sync_; // Where have we synced up to
  219. uint64_t file_offset_; // Offset of base_ in file
  220. #ifdef ROCKSDB_FALLOCATE_PRESENT
  221. bool allow_fallocate_; // If false, fallocate calls are bypassed
  222. bool fallocate_with_keep_size_;
  223. #endif
  224. // Roundup x to a multiple of y
  225. static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
  226. size_t TruncateToPageBoundary(size_t s) {
  227. s -= (s & (page_size_ - 1));
  228. assert((s % page_size_) == 0);
  229. return s;
  230. }
  231. IOStatus MapNewRegion();
  232. IOStatus UnmapCurrentRegion();
  233. IOStatus Msync();
  234. public:
  235. PosixMmapFile(const std::string& fname, int fd, size_t page_size,
  236. const EnvOptions& options);
  237. ~PosixMmapFile();
  238. // Means Close() will properly take care of truncate
  239. // and it does not need any additional information
  240. virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/,
  241. IODebugContext* /*dbg*/) override {
  242. return IOStatus::OK();
  243. }
  244. virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
  245. virtual IOStatus Append(const Slice& data, const IOOptions& opts,
  246. IODebugContext* dbg) override;
  247. virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
  248. virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
  249. virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  250. virtual uint64_t GetFileSize(const IOOptions& opts,
  251. IODebugContext* dbg) override;
  252. virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  253. #ifdef ROCKSDB_FALLOCATE_PRESENT
  254. virtual IOStatus Allocate(uint64_t offset, uint64_t len,
  255. const IOOptions& opts,
  256. IODebugContext* dbg) override;
  257. #endif
  258. };
  259. class PosixRandomRWFile : public FSRandomRWFile {
  260. public:
  261. explicit PosixRandomRWFile(const std::string& fname, int fd,
  262. const EnvOptions& options);
  263. virtual ~PosixRandomRWFile();
  264. virtual IOStatus Write(uint64_t offset, const Slice& data,
  265. const IOOptions& opts, IODebugContext* dbg) override;
  266. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
  267. Slice* result, char* scratch,
  268. IODebugContext* dbg) const override;
  269. virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
  270. virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
  271. virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  272. virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
  273. private:
  274. const std::string filename_;
  275. int fd_;
  276. };
  277. struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer {
  278. PosixMemoryMappedFileBuffer(void* _base, size_t _length)
  279. : MemoryMappedFileBuffer(_base, _length) {}
  280. virtual ~PosixMemoryMappedFileBuffer();
  281. };
  282. class PosixDirectory : public FSDirectory {
  283. public:
  284. explicit PosixDirectory(int fd) : fd_(fd) {}
  285. ~PosixDirectory();
  286. virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  287. private:
  288. int fd_;
  289. };
  290. } // namespace ROCKSDB_NAMESPACE