io_posix.cc 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #ifdef ROCKSDB_LIB_IO_POSIX
  10. #include "env/io_posix.h"
  11. #include <fcntl.h>
  12. #include <algorithm>
  13. #include <cerrno>
  14. #if defined(OS_LINUX)
  15. #include <linux/fs.h>
  16. #ifndef FALLOC_FL_KEEP_SIZE
  17. #include <linux/falloc.h>
  18. #endif
  19. #endif
  20. #include <sys/ioctl.h>
  21. #include <sys/mman.h>
  22. #include <sys/stat.h>
  23. #include <sys/types.h>
  24. #include <cstdio>
  25. #include <cstdlib>
  26. #include <cstring>
  27. #if defined(OS_LINUX) || defined(OS_ANDROID)
  28. #include <sys/statfs.h>
  29. #include <sys/sysmacros.h>
  30. #endif
  31. #include "monitoring/iostats_context_imp.h"
  32. #include "port/port.h"
  33. #include "port/stack_trace.h"
  34. #include "rocksdb/slice.h"
  35. #include "test_util/sync_point.h"
  36. #include "util/autovector.h"
  37. #include "util/coding.h"
  38. #include "util/string_util.h"
  39. #if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
  40. #define F_LINUX_SPECIFIC_BASE 1024
  41. #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
  42. #endif
  43. namespace ROCKSDB_NAMESPACE {
  44. std::string IOErrorMsg(const std::string& context,
  45. const std::string& file_name) {
  46. if (file_name.empty()) {
  47. return context;
  48. }
  49. return context + ": " + file_name;
  50. }
  51. // file_name can be left empty if it is not unkown.
  52. IOStatus IOError(const std::string& context, const std::string& file_name,
  53. int err_number) {
  54. switch (err_number) {
  55. case ENOSPC: {
  56. IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
  57. errnoStr(err_number).c_str());
  58. s.SetRetryable(true);
  59. return s;
  60. }
  61. case ESTALE:
  62. return IOStatus::IOError(IOStatus::kStaleFile);
  63. case ENOENT:
  64. return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
  65. errnoStr(err_number).c_str());
  66. default:
  67. return IOStatus::IOError(IOErrorMsg(context, file_name),
  68. errnoStr(err_number).c_str());
  69. }
  70. }
  71. // A wrapper for fadvise, if the platform doesn't support fadvise,
  72. // it will simply return 0.
  73. int Fadvise(int fd, off_t offset, size_t len, int advice) {
  74. #ifdef OS_LINUX
  75. return posix_fadvise(fd, offset, len, advice);
  76. #else
  77. (void)fd;
  78. (void)offset;
  79. (void)len;
  80. (void)advice;
  81. return 0; // simply do nothing.
  82. #endif
  83. }
  84. // A wrapper for fadvise, if the platform doesn't support fadvise,
  85. // it will simply return 0.
  86. int Madvise(void* addr, size_t len, int advice) {
  87. #ifdef OS_LINUX
  88. return posix_madvise(addr, len, advice);
  89. #else
  90. (void)addr;
  91. (void)len;
  92. (void)advice;
  93. return 0; // simply do nothing.
  94. #endif
  95. }
  96. namespace {
  97. // On MacOS (and probably *BSD), the posix write and pwrite calls do not support
  98. // buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
  99. // cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
  100. // the writes aligned.
  101. bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  102. const size_t kLimit1Gb = 1UL << 30;
  103. const char* src = buf;
  104. size_t left = nbyte;
  105. while (left != 0) {
  106. size_t bytes_to_write = std::min(left, kLimit1Gb);
  107. ssize_t done = write(fd, src, bytes_to_write);
  108. if (done < 0) {
  109. if (errno == EINTR) {
  110. continue;
  111. }
  112. return false;
  113. }
  114. left -= done;
  115. src += done;
  116. }
  117. return true;
  118. }
  119. bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  120. const size_t kLimit1Gb = 1UL << 30;
  121. const char* src = buf;
  122. size_t left = nbyte;
  123. while (left != 0) {
  124. size_t bytes_to_write = std::min(left, kLimit1Gb);
  125. ssize_t done = pwrite(fd, src, bytes_to_write, offset);
  126. if (done < 0) {
  127. if (errno == EINTR) {
  128. continue;
  129. }
  130. return false;
  131. }
  132. left -= done;
  133. offset += done;
  134. src += done;
  135. }
  136. return true;
  137. }
  138. #ifdef ROCKSDB_RANGESYNC_PRESENT
  139. #if !defined(ZFS_SUPER_MAGIC)
  140. // The magic number for ZFS was not exposed until recently. It should be fixed
  141. // forever so we can just copy the magic number here.
  142. #define ZFS_SUPER_MAGIC 0x2fc12fc1
  143. #endif
  144. bool IsSyncFileRangeSupported(int fd) {
  145. // This function tracks and checks for cases where we know `sync_file_range`
  146. // definitely will not work properly despite passing the compile-time check
  147. // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
  148. // fail in unexpected ways, we allow `sync_file_range` to be used. This way
  149. // should minimize risk of impacting existing use cases.
  150. struct statfs buf;
  151. int ret = fstatfs(fd, &buf);
  152. assert(ret == 0);
  153. if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
  154. // Testing on ZFS showed the writeback did not happen asynchronously when
  155. // `sync_file_range` was called, even though it returned success. Avoid it
  156. // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
  157. // even though this'll incur extra I/O for metadata.
  158. return false;
  159. }
  160. ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
  161. assert(!(ret == -1 && errno != ENOSYS));
  162. if (ret == -1 && errno == ENOSYS) {
  163. // `sync_file_range` is not implemented on all platforms even if
  164. // compile-time checks pass and a supported filesystem is in-use. For
  165. // example, using ext4 on WSL (Windows Subsystem for Linux),
  166. // `sync_file_range()` returns `ENOSYS`
  167. // ("Function not implemented").
  168. return false;
  169. }
  170. // None of the known cases matched, so allow `sync_file_range` use.
  171. return true;
  172. }
  173. #undef ZFS_SUPER_MAGIC
  174. #endif // ROCKSDB_RANGESYNC_PRESENT
  175. } // anonymous namespace
  176. /*
  177. * PosixSequentialFile
  178. */
  179. PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
  180. int fd, size_t logical_block_size,
  181. const EnvOptions& options)
  182. : filename_(fname),
  183. file_(file),
  184. fd_(fd),
  185. use_direct_io_(options.use_direct_reads),
  186. logical_sector_size_(logical_block_size) {
  187. assert(!options.use_direct_reads || !options.use_mmap_reads);
  188. }
  189. PosixSequentialFile::~PosixSequentialFile() {
  190. if (!use_direct_io()) {
  191. assert(file_);
  192. fclose(file_);
  193. } else {
  194. assert(fd_);
  195. close(fd_);
  196. }
  197. }
  198. IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
  199. Slice* result, char* scratch,
  200. IODebugContext* /*dbg*/) {
  201. assert(result != nullptr && !use_direct_io());
  202. IOStatus s;
  203. size_t r = 0;
  204. do {
  205. clearerr(file_);
  206. r = fread_unlocked(scratch, 1, n, file_);
  207. } while (r == 0 && ferror(file_) && errno == EINTR);
  208. *result = Slice(scratch, r);
  209. if (r < n) {
  210. if (feof(file_)) {
  211. // We leave status as ok if we hit the end of the file
  212. // We also clear the error so that the reads can continue
  213. // if a new data is written to the file
  214. clearerr(file_);
  215. } else {
  216. // A partial read with an error: return a non-ok status
  217. s = IOError("While reading file sequentially", filename_, errno);
  218. }
  219. }
  220. return s;
  221. }
  222. IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
  223. const IOOptions& /*opts*/,
  224. Slice* result, char* scratch,
  225. IODebugContext* /*dbg*/) {
  226. assert(use_direct_io());
  227. assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  228. assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  229. assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
  230. IOStatus s;
  231. ssize_t r = -1;
  232. size_t left = n;
  233. char* ptr = scratch;
  234. while (left > 0) {
  235. r = pread(fd_, ptr, left, static_cast<off_t>(offset));
  236. if (r <= 0) {
  237. if (r == -1 && errno == EINTR) {
  238. continue;
  239. }
  240. break;
  241. }
  242. ptr += r;
  243. offset += r;
  244. left -= r;
  245. if (!IsSectorAligned(r, GetRequiredBufferAlignment())) {
  246. // Bytes reads don't fill sectors. Should only happen at the end
  247. // of the file.
  248. break;
  249. }
  250. }
  251. if (r < 0) {
  252. // An error: return a non-ok status
  253. s = IOError("While pread " + std::to_string(n) + " bytes from offset " +
  254. std::to_string(offset),
  255. filename_, errno);
  256. }
  257. *result = Slice(scratch, (r < 0) ? 0 : n - left);
  258. return s;
  259. }
  260. IOStatus PosixSequentialFile::Skip(uint64_t n) {
  261. if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
  262. return IOError("While fseek to skip " + std::to_string(n) + " bytes",
  263. filename_, errno);
  264. }
  265. return IOStatus::OK();
  266. }
  267. IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
  268. #ifndef OS_LINUX
  269. (void)offset;
  270. (void)length;
  271. return IOStatus::OK();
  272. #else
  273. if (!use_direct_io()) {
  274. // free OS pages
  275. int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  276. if (ret != 0) {
  277. return IOError("While fadvise NotNeeded offset " +
  278. std::to_string(offset) + " len " +
  279. std::to_string(length),
  280. filename_, errno);
  281. }
  282. }
  283. return IOStatus::OK();
  284. #endif
  285. }
  286. /*
  287. * PosixRandomAccessFile
  288. */
  289. #if defined(OS_LINUX)
  290. size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  291. if (max_size < kMaxVarint64Length * 3) {
  292. return 0;
  293. }
  294. struct stat buf;
  295. int result = fstat(fd, &buf);
  296. if (result == -1) {
  297. return 0;
  298. }
  299. long version = 0;
  300. result = ioctl(fd, FS_IOC_GETVERSION, &version);
  301. TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
  302. if (result == -1) {
  303. return 0;
  304. }
  305. uint64_t uversion = (uint64_t)version;
  306. char* rid = id;
  307. rid = EncodeVarint64(rid, buf.st_dev);
  308. rid = EncodeVarint64(rid, buf.st_ino);
  309. rid = EncodeVarint64(rid, uversion);
  310. assert(rid >= id);
  311. return static_cast<size_t>(rid - id);
  312. }
  313. #endif
  314. #if defined(OS_MACOSX) || defined(OS_AIX)
  315. size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  316. if (max_size < kMaxVarint64Length * 3) {
  317. return 0;
  318. }
  319. struct stat buf;
  320. int result = fstat(fd, &buf);
  321. if (result == -1) {
  322. return 0;
  323. }
  324. char* rid = id;
  325. rid = EncodeVarint64(rid, buf.st_dev);
  326. rid = EncodeVarint64(rid, buf.st_ino);
  327. rid = EncodeVarint64(rid, buf.st_gen);
  328. assert(rid >= id);
  329. return static_cast<size_t>(rid - id);
  330. }
  331. #endif
  332. #ifdef OS_LINUX
  333. std::string RemoveTrailingSlash(const std::string& path) {
  334. std::string p = path;
  335. if (p.size() > 1 && p.back() == '/') {
  336. p.pop_back();
  337. }
  338. return p;
  339. }
  340. Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
  341. const std::vector<std::string>& directories) {
  342. std::vector<std::string> dirs;
  343. dirs.reserve(directories.size());
  344. for (auto& d : directories) {
  345. dirs.emplace_back(RemoveTrailingSlash(d));
  346. }
  347. std::map<std::string, size_t> dir_sizes;
  348. {
  349. ReadLock lock(&cache_mutex_);
  350. for (const auto& dir : dirs) {
  351. if (cache_.find(dir) == cache_.end()) {
  352. dir_sizes.emplace(dir, 0);
  353. }
  354. }
  355. }
  356. Status s;
  357. for (auto& dir_size : dir_sizes) {
  358. s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second);
  359. if (!s.ok()) {
  360. return s;
  361. }
  362. }
  363. WriteLock lock(&cache_mutex_);
  364. for (const auto& dir : dirs) {
  365. auto& v = cache_[dir];
  366. v.ref++;
  367. auto dir_size = dir_sizes.find(dir);
  368. if (dir_size != dir_sizes.end()) {
  369. v.size = dir_size->second;
  370. }
  371. }
  372. return s;
  373. }
  374. void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
  375. const std::vector<std::string>& directories) {
  376. std::vector<std::string> dirs;
  377. dirs.reserve(directories.size());
  378. for (auto& dir : directories) {
  379. dirs.emplace_back(RemoveTrailingSlash(dir));
  380. }
  381. WriteLock lock(&cache_mutex_);
  382. for (const auto& dir : dirs) {
  383. auto it = cache_.find(dir);
  384. if (it != cache_.end() && !(--(it->second.ref))) {
  385. cache_.erase(it);
  386. }
  387. }
  388. }
  389. size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
  390. int fd) {
  391. std::string dir = fname.substr(0, fname.find_last_of('/'));
  392. if (dir.empty()) {
  393. dir = "/";
  394. }
  395. {
  396. ReadLock lock(&cache_mutex_);
  397. auto it = cache_.find(dir);
  398. if (it != cache_.end()) {
  399. return it->second.size;
  400. }
  401. }
  402. return get_logical_block_size_of_fd_(fd);
  403. }
  404. #endif
  405. Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
  406. size_t* size) {
  407. return GetQueueSysfsFileValueofDirectory(directory,
  408. GetLogicalBlockSizeFileName(), size);
  409. }
  410. Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory,
  411. size_t* kb) {
  412. return GetQueueSysfsFileValueofDirectory(directory, GetMaxSectorsKBFileName(),
  413. kb);
  414. }
  415. Status PosixHelper::GetQueueSysfsFileValueofDirectory(
  416. const std::string& directory, const std::string& file_name, size_t* value) {
  417. int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
  418. if (fd == -1) {
  419. return Status::IOError("Cannot open directory " + directory);
  420. }
  421. if (file_name == PosixHelper::GetLogicalBlockSizeFileName()) {
  422. *value = PosixHelper::GetLogicalBlockSizeOfFd(fd);
  423. } else if (file_name == PosixHelper::GetMaxSectorsKBFileName()) {
  424. *value = PosixHelper::GetMaxSectorsKBOfFd(fd);
  425. } else {
  426. assert(false);
  427. }
  428. close(fd);
  429. return Status::OK();
  430. }
  431. size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
  432. return GetQueueSysfsFileValueOfFd(fd, GetLogicalBlockSizeFileName(),
  433. kDefaultPageSize);
  434. }
  435. size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) {
  436. return GetQueueSysfsFileValueOfFd(fd, GetMaxSectorsKBFileName(),
  437. kDefaultMaxSectorsKB);
  438. }
  439. size_t PosixHelper::GetQueueSysfsFileValueOfFd(
  440. int fd, const std::string& file_name, const size_t default_return_value) {
  441. #ifdef OS_LINUX
  442. struct stat buf;
  443. int result = fstat(fd, &buf);
  444. if (result == -1) {
  445. return default_return_value;
  446. }
  447. // Get device number
  448. if (major(buf.st_dev) == 0) {
  449. // Unnamed devices (e.g. non-device mounts), reserved as null device number.
  450. // These don't have an entry in /sys/dev/block/. Return a sensible default.
  451. return default_return_value;
  452. }
  453. // Get device path
  454. const int kBufferSize = 100;
  455. char path[kBufferSize];
  456. char real_path[PATH_MAX + 1];
  457. snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
  458. minor(buf.st_dev));
  459. if (realpath(path, real_path) == nullptr) {
  460. return default_return_value;
  461. }
  462. std::string device_dir(real_path);
  463. // Get the queue sysfs file path
  464. if (!device_dir.empty() && device_dir.back() == '/') {
  465. device_dir.pop_back();
  466. }
  467. // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
  468. // and nvme0n1 have it.
  469. // $ ls -al '/sys/dev/block/8:3'
  470. // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
  471. // ../../block/sda/sda3
  472. // $ ls -al '/sys/dev/block/259:4'
  473. // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
  474. // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
  475. size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
  476. if (parent_end == std::string::npos) {
  477. return default_return_value;
  478. }
  479. size_t parent_begin = device_dir.rfind('/', parent_end - 1);
  480. if (parent_begin == std::string::npos) {
  481. return default_return_value;
  482. }
  483. std::string parent =
  484. device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
  485. std::string child = device_dir.substr(parent_end + 1, std::string::npos);
  486. if (parent != "block" &&
  487. (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
  488. device_dir = device_dir.substr(0, parent_end);
  489. }
  490. std::string fname = device_dir + "/queue/" + file_name;
  491. // Get value in the queue sysfs file
  492. FILE* fp;
  493. size_t value = 0;
  494. fp = fopen(fname.c_str(), "r");
  495. if (fp != nullptr) {
  496. char* line = nullptr;
  497. size_t len = 0;
  498. if (getline(&line, &len, fp) != -1) {
  499. sscanf(line, "%zu", &value);
  500. }
  501. free(line);
  502. fclose(fp);
  503. }
  504. if (file_name == GetLogicalBlockSizeFileName()) {
  505. if (value != 0 && (value & (value - 1)) == 0) {
  506. return value;
  507. }
  508. } else if (file_name == GetMaxSectorsKBFileName()) {
  509. if (value != 0) {
  510. return value;
  511. }
  512. } else {
  513. assert(false);
  514. }
  515. #endif
  516. (void)fd;
  517. (void)file_name;
  518. return default_return_value;
  519. }
  520. /*
  521. * PosixRandomAccessFile
  522. *
  523. * pread() based random-access
  524. */
  525. PosixRandomAccessFile::PosixRandomAccessFile(
  526. const std::string& fname, int fd, size_t logical_block_size,
  527. const EnvOptions& options
  528. #if defined(ROCKSDB_IOURING_PRESENT)
  529. ,
  530. ThreadLocalPtr* thread_local_io_urings
  531. #endif
  532. )
  533. : filename_(fname),
  534. fd_(fd),
  535. use_direct_io_(options.use_direct_reads),
  536. logical_sector_size_(logical_block_size)
  537. #if defined(ROCKSDB_IOURING_PRESENT)
  538. ,
  539. thread_local_io_urings_(thread_local_io_urings)
  540. #endif
  541. {
  542. assert(!options.use_direct_reads || !options.use_mmap_reads);
  543. assert(!options.use_mmap_reads);
  544. }
  545. PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
  546. IOStatus PosixRandomAccessFile::GetFileSize(uint64_t* result) {
  547. struct stat sbuf {};
  548. if (fstat(fd_, &sbuf) != 0) {
  549. *result = 0;
  550. return IOError("While fstat with fd " + std::to_string(fd_), filename_,
  551. errno);
  552. }
  553. *result = sbuf.st_size;
  554. return IOStatus::OK();
  555. }
  556. IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
  557. const IOOptions& /*opts*/, Slice* result,
  558. char* scratch,
  559. IODebugContext* /*dbg*/) const {
  560. if (use_direct_io()) {
  561. assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  562. assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  563. assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
  564. }
  565. IOStatus s;
  566. ssize_t r = -1;
  567. size_t left = n;
  568. char* ptr = scratch;
  569. while (left > 0) {
  570. r = pread(fd_, ptr, left, static_cast<off_t>(offset));
  571. if (r <= 0) {
  572. if (r == -1 && errno == EINTR) {
  573. continue;
  574. }
  575. break;
  576. }
  577. ptr += r;
  578. offset += r;
  579. left -= r;
  580. if (use_direct_io() &&
  581. r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
  582. // Bytes reads don't fill sectors. Should only happen at the end
  583. // of the file.
  584. break;
  585. }
  586. }
  587. if (r < 0) {
  588. // An error: return a non-ok status
  589. s = IOError("While pread offset " + std::to_string(offset) + " len " +
  590. std::to_string(n),
  591. filename_, errno);
  592. }
  593. *result = Slice(scratch, (r < 0) ? 0 : n - left);
  594. return s;
  595. }
  596. IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
  597. const IOOptions& options,
  598. IODebugContext* dbg) {
  599. if (use_direct_io()) {
  600. for (size_t i = 0; i < num_reqs; i++) {
  601. assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
  602. assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
  603. assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
  604. }
  605. }
  606. #if defined(ROCKSDB_IOURING_PRESENT)
  607. struct io_uring* iu = nullptr;
  608. if (thread_local_io_urings_) {
  609. iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
  610. if (iu == nullptr) {
  611. iu = CreateIOUring();
  612. if (iu != nullptr) {
  613. thread_local_io_urings_->Reset(iu);
  614. }
  615. }
  616. }
  617. // Init failed, platform doesn't support io_uring. Fall back to
  618. // serialized reads
  619. if (iu == nullptr) {
  620. return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  621. }
  622. IOStatus ios = IOStatus::OK();
  623. struct WrappedReadRequest {
  624. FSReadRequest* req;
  625. struct iovec iov;
  626. size_t finished_len;
  627. explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
  628. };
  629. autovector<WrappedReadRequest, 32> req_wraps;
  630. autovector<WrappedReadRequest*, 4> incomplete_rq_list;
  631. std::unordered_set<WrappedReadRequest*> wrap_cache;
  632. for (size_t i = 0; i < num_reqs; i++) {
  633. req_wraps.emplace_back(&reqs[i]);
  634. }
  635. size_t reqs_off = 0;
  636. while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
  637. size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
  638. // If requests exceed depth, split it into batches
  639. if (this_reqs > kIoUringDepth) {
  640. this_reqs = kIoUringDepth;
  641. }
  642. assert(incomplete_rq_list.size() <= this_reqs);
  643. for (size_t i = 0; i < this_reqs; i++) {
  644. WrappedReadRequest* rep_to_submit;
  645. if (i < incomplete_rq_list.size()) {
  646. rep_to_submit = incomplete_rq_list[i];
  647. } else {
  648. rep_to_submit = &req_wraps[reqs_off++];
  649. }
  650. assert(rep_to_submit->req->len > rep_to_submit->finished_len);
  651. rep_to_submit->iov.iov_base =
  652. rep_to_submit->req->scratch + rep_to_submit->finished_len;
  653. rep_to_submit->iov.iov_len =
  654. rep_to_submit->req->len - rep_to_submit->finished_len;
  655. struct io_uring_sqe* sqe;
  656. sqe = io_uring_get_sqe(iu);
  657. io_uring_prep_readv(
  658. sqe, fd_, &rep_to_submit->iov, 1,
  659. rep_to_submit->req->offset + rep_to_submit->finished_len);
  660. io_uring_sqe_set_data(sqe, rep_to_submit);
  661. wrap_cache.emplace(rep_to_submit);
  662. }
  663. incomplete_rq_list.clear();
  664. ssize_t ret =
  665. io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
  666. TEST_SYNC_POINT_CALLBACK(
  667. "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
  668. &ret);
  669. TEST_SYNC_POINT_CALLBACK(
  670. "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
  671. iu);
  672. if (static_cast<size_t>(ret) != this_reqs) {
  673. fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
  674. // If error happens and we submitted fewer than expected, it is an
  675. // exception case and we don't retry here. We should still consume
  676. // what is is submitted in the ring.
  677. for (ssize_t i = 0; i < ret; i++) {
  678. struct io_uring_cqe* cqe = nullptr;
  679. io_uring_wait_cqe(iu, &cqe);
  680. if (cqe != nullptr) {
  681. io_uring_cqe_seen(iu, cqe);
  682. }
  683. }
  684. return IOStatus::IOError("io_uring_submit_and_wait() requested " +
  685. std::to_string(this_reqs) + " but returned " +
  686. std::to_string(ret));
  687. }
  688. for (size_t i = 0; i < this_reqs; i++) {
  689. struct io_uring_cqe* cqe = nullptr;
  690. WrappedReadRequest* req_wrap;
  691. // We could use the peek variant here, but this seems safer in terms
  692. // of our initial wait not reaping all completions
  693. ret = io_uring_wait_cqe(iu, &cqe);
  694. TEST_SYNC_POINT_CALLBACK(
  695. "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
  696. if (ret) {
  697. ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
  698. std::to_string(ret));
  699. if (cqe != nullptr) {
  700. io_uring_cqe_seen(iu, cqe);
  701. }
  702. continue;
  703. }
  704. req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
  705. // Reset cqe data to catch any stray reuse of it
  706. static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
  707. // Check that we got a valid unique cqe data
  708. auto wrap_check = wrap_cache.find(req_wrap);
  709. if (wrap_check == wrap_cache.end()) {
  710. fprintf(stderr,
  711. "PosixRandomAccessFile::MultiRead: "
  712. "Bad cqe data from IO uring - %p\n",
  713. req_wrap);
  714. port::PrintStack();
  715. ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
  716. std::to_string((uint64_t)req_wrap));
  717. continue;
  718. }
  719. wrap_cache.erase(wrap_check);
  720. FSReadRequest* req = req_wrap->req;
  721. size_t bytes_read = 0;
  722. bool read_again = false;
  723. UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
  724. false /*async_read*/, use_direct_io(),
  725. GetRequiredBufferAlignment(), req_wrap->finished_len, req,
  726. bytes_read, read_again);
  727. int32_t res = cqe->res;
  728. if (res >= 0) {
  729. if (bytes_read == 0) {
  730. if (read_again) {
  731. Slice tmp_slice;
  732. req->status =
  733. Read(req->offset + req_wrap->finished_len,
  734. req->len - req_wrap->finished_len, options, &tmp_slice,
  735. req->scratch + req_wrap->finished_len, dbg);
  736. req->result =
  737. Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
  738. }
  739. // else It means EOF so no need to do anything.
  740. } else if (bytes_read < req_wrap->iov.iov_len) {
  741. incomplete_rq_list.push_back(req_wrap);
  742. }
  743. }
  744. io_uring_cqe_seen(iu, cqe);
  745. }
  746. wrap_cache.clear();
  747. }
  748. return ios;
  749. #else
  750. return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  751. #endif
  752. }
  753. IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
  754. const IOOptions& /*opts*/,
  755. IODebugContext* /*dbg*/) {
  756. IOStatus s;
  757. if (!use_direct_io()) {
  758. ssize_t r = 0;
  759. #ifdef OS_LINUX
  760. r = readahead(fd_, offset, n);
  761. #endif
  762. #ifdef OS_MACOSX
  763. radvisory advice;
  764. advice.ra_offset = static_cast<off_t>(offset);
  765. advice.ra_count = static_cast<int>(n);
  766. r = fcntl(fd_, F_RDADVISE, &advice);
  767. #endif
  768. if (r == -1) {
  769. s = IOError("While prefetching offset " + std::to_string(offset) +
  770. " len " + std::to_string(n),
  771. filename_, errno);
  772. }
  773. }
  774. return s;
  775. }
  #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
  // Writes an identifier for the underlying file into `id` (at most `max_size`
  // bytes) by delegating to PosixHelper::GetUniqueIdFromFile, and returns
  // whatever length that helper reports.
  size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
    const size_t id_length =
        PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
    return id_length;
  }
  #endif
  781. void PosixRandomAccessFile::Hint(AccessPattern pattern) {
  782. if (use_direct_io()) {
  783. return;
  784. }
  785. switch (pattern) {
  786. case kNormal:
  787. Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
  788. break;
  789. case kRandom:
  790. Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
  791. break;
  792. case kSequential:
  793. Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
  794. break;
  795. case kWillNeed:
  796. Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
  797. break;
  798. case kWontNeed:
  799. Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
  800. break;
  801. default:
  802. assert(false);
  803. break;
  804. }
  805. }
  806. IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
  807. if (use_direct_io()) {
  808. return IOStatus::OK();
  809. }
  810. #ifndef OS_LINUX
  811. (void)offset;
  812. (void)length;
  813. return IOStatus::OK();
  814. #else
  815. // free OS pages
  816. int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  817. if (ret == 0) {
  818. return IOStatus::OK();
  819. }
  820. return IOError("While fadvise NotNeeded offset " + std::to_string(offset) +
  821. " len " + std::to_string(length),
  822. filename_, errno);
  823. #endif
  824. }
  825. IOStatus PosixRandomAccessFile::ReadAsync(
  826. FSReadRequest& req, const IOOptions& /*opts*/,
  827. std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
  828. void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) {
  829. if (use_direct_io()) {
  830. assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment()));
  831. assert(IsSectorAligned(req.len, GetRequiredBufferAlignment()));
  832. assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment()));
  833. }
  834. #if defined(ROCKSDB_IOURING_PRESENT)
  835. // io_uring_queue_init.
  836. struct io_uring* iu = nullptr;
  837. if (thread_local_io_urings_) {
  838. iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
  839. if (iu == nullptr) {
  840. iu = CreateIOUring();
  841. if (iu != nullptr) {
  842. thread_local_io_urings_->Reset(iu);
  843. }
  844. }
  845. }
  846. // Init failed, platform doesn't support io_uring.
  847. if (iu == nullptr) {
  848. fprintf(stderr, "failed to init io_uring\n");
  849. return IOStatus::NotSupported("ReadAsync: failed to init io_uring");
  850. }
  851. // Allocate io_handle.
  852. IOHandleDeleter deletefn = [](void* args) -> void {
  853. delete (static_cast<Posix_IOHandle*>(args));
  854. args = nullptr;
  855. };
  856. // Initialize Posix_IOHandle.
  857. Posix_IOHandle* posix_handle =
  858. new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch,
  859. use_direct_io(), GetRequiredBufferAlignment());
  860. posix_handle->iov.iov_base = req.scratch;
  861. posix_handle->iov.iov_len = req.len;
  862. *io_handle = static_cast<void*>(posix_handle);
  863. *del_fn = deletefn;
  864. // Step 3: io_uring_sqe_set_data
  865. struct io_uring_sqe* sqe;
  866. sqe = io_uring_get_sqe(iu);
  867. io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov,
  868. /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset);
  869. // Sets sqe->user_data to posix_handle.
  870. io_uring_sqe_set_data(sqe, posix_handle);
  871. // Step 4: io_uring_submit
  872. ssize_t ret = io_uring_submit(iu);
  873. if (ret < 0) {
  874. fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
  875. return IOStatus::IOError("io_uring_submit() requested but returned " +
  876. std::to_string(ret));
  877. }
  878. return IOStatus::OK();
  879. #else
  880. (void)req;
  881. (void)cb;
  882. (void)cb_arg;
  883. (void)io_handle;
  884. (void)del_fn;
  885. return IOStatus::NotSupported(
  886. "ReadAsync: ROCKSDB_IOURING_PRESENT is not set");
  887. #endif
  888. }
  889. /*
  890. * PosixMmapReadableFile
  891. *
  892. * mmap() based random-access
  893. */
  894. // base[0,length-1] contains the mmapped contents of the file.
  895. PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
  896. const std::string& fname,
  897. void* base, size_t length,
  898. const EnvOptions& options)
  899. : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
  900. #ifdef NDEBUG
  901. (void)options;
  902. #endif
  903. fd_ = fd_ + 0; // suppress the warning for used variables
  904. assert(options.use_mmap_reads);
  905. assert(!options.use_direct_reads);
  906. }
  907. PosixMmapReadableFile::~PosixMmapReadableFile() {
  908. int ret = munmap(mmapped_region_, length_);
  909. if (ret != 0) {
  910. fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
  911. mmapped_region_, length_);
  912. }
  913. close(fd_);
  914. }
  915. IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
  916. const IOOptions& /*opts*/, Slice* result,
  917. char* /*scratch*/,
  918. IODebugContext* /*dbg*/) const {
  919. IOStatus s;
  920. if (offset > length_) {
  921. *result = Slice();
  922. return IOError("While mmap read offset " + std::to_string(offset) +
  923. " larger than file length " + std::to_string(length_),
  924. filename_, EINVAL);
  925. } else if (offset + n > length_) {
  926. n = static_cast<size_t>(length_ - offset);
  927. }
  928. *result = Slice(static_cast<char*>(mmapped_region_) + offset, n);
  929. return s;
  930. }
  931. void PosixMmapReadableFile::Hint(AccessPattern pattern) {
  932. switch (pattern) {
  933. case kNormal:
  934. Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL);
  935. break;
  936. case kRandom:
  937. Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM);
  938. break;
  939. case kSequential:
  940. Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL);
  941. break;
  942. case kWillNeed:
  943. Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED);
  944. break;
  945. case kWontNeed:
  946. Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED);
  947. break;
  948. default:
  949. assert(false);
  950. break;
  951. }
  952. }
  953. IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
  954. #ifndef OS_LINUX
  955. (void)offset;
  956. (void)length;
  957. return IOStatus::OK();
  958. #else
  959. // free OS pages
  960. int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  961. if (ret == 0) {
  962. return IOStatus::OK();
  963. }
  964. return IOError("While fadvise not needed. Offset " + std::to_string(offset) +
  965. " len" + std::to_string(length),
  966. filename_, errno);
  967. #endif
  968. }
  969. IOStatus PosixMmapReadableFile::GetFileSize(uint64_t* result) {
  970. *result = length_;
  971. return IOStatus::OK();
  972. }
  973. /*
  974. * PosixMmapFile
  975. *
  976. * We preallocate up to an extra megabyte and use memcpy to append new
  977. * data to the file. This is safe since we either properly close the
  978. * file before reading from it, or for log files, the reading code
  979. * knows enough to skip zero suffixes.
  980. */
  981. IOStatus PosixMmapFile::UnmapCurrentRegion() {
  982. TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
  983. if (base_ != nullptr) {
  984. int munmap_status = munmap(base_, limit_ - base_);
  985. if (munmap_status != 0) {
  986. return IOError("While munmap", filename_, munmap_status);
  987. }
  988. file_offset_ += limit_ - base_;
  989. base_ = nullptr;
  990. limit_ = nullptr;
  991. last_sync_ = nullptr;
  992. dst_ = nullptr;
  993. // Increase the amount we map the next time, but capped at 1MB
  994. if (map_size_ < (1 << 20)) {
  995. map_size_ *= 2;
  996. }
  997. }
  998. return IOStatus::OK();
  999. }
  1000. IOStatus PosixMmapFile::MapNewRegion() {
  1001. #ifdef ROCKSDB_FALLOCATE_PRESENT
  1002. assert(base_ == nullptr);
  1003. TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
  1004. // we can't fallocate with FALLOC_FL_KEEP_SIZE here
  1005. if (allow_fallocate_) {
  1006. IOSTATS_TIMER_GUARD(allocate_nanos);
  1007. int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
  1008. if (alloc_status != 0) {
  1009. // fallback to posix_fallocate
  1010. alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
  1011. }
  1012. if (alloc_status != 0) {
  1013. return IOStatus::IOError("Error allocating space to file : " + filename_ +
  1014. "Error : " + errnoStr(alloc_status).c_str());
  1015. }
  1016. }
  1017. TEST_KILL_RANDOM("PosixMmapFile::Append:1");
  1018. void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
  1019. file_offset_);
  1020. if (ptr == MAP_FAILED) {
  1021. return IOStatus::IOError("MMap failed on " + filename_);
  1022. }
  1023. TEST_KILL_RANDOM("PosixMmapFile::Append:2");
  1024. base_ = static_cast<char*>(ptr);
  1025. limit_ = base_ + map_size_;
  1026. dst_ = base_;
  1027. last_sync_ = base_;
  1028. return IOStatus::OK();
  1029. #else
  1030. return IOStatus::NotSupported("This platform doesn't support fallocate()");
  1031. #endif
  1032. }
  1033. IOStatus PosixMmapFile::Msync() {
  1034. if (dst_ == last_sync_) {
  1035. return IOStatus::OK();
  1036. }
  1037. // Find the beginnings of the pages that contain the first and last
  1038. // bytes to be synced.
  1039. size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
  1040. size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
  1041. last_sync_ = dst_;
  1042. TEST_KILL_RANDOM("PosixMmapFile::Msync:0");
  1043. if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
  1044. return IOError("While msync", filename_, errno);
  1045. }
  1046. return IOStatus::OK();
  1047. }
  1048. PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
  1049. const EnvOptions& options,
  1050. uint64_t initial_file_size)
  1051. : filename_(fname),
  1052. fd_(fd),
  1053. page_size_(page_size),
  1054. map_size_(Roundup(65536, page_size)),
  1055. base_(nullptr),
  1056. limit_(nullptr),
  1057. dst_(nullptr),
  1058. last_sync_(nullptr),
  1059. file_offset_(initial_file_size) {
  1060. #ifdef ROCKSDB_FALLOCATE_PRESENT
  1061. allow_fallocate_ = options.allow_fallocate;
  1062. fallocate_with_keep_size_ = options.fallocate_with_keep_size;
  1063. #else
  1064. (void)options;
  1065. #endif
  1066. assert((page_size & (page_size - 1)) == 0);
  1067. assert(options.use_mmap_writes);
  1068. assert(!options.use_direct_writes);
  1069. }
  1070. PosixMmapFile::~PosixMmapFile() {
  1071. if (fd_ >= 0) {
  1072. IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr);
  1073. s.PermitUncheckedError();
  1074. }
  1075. }
  1076. IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
  1077. IODebugContext* /*dbg*/) {
  1078. const char* src = data.data();
  1079. size_t left = data.size();
  1080. while (left > 0) {
  1081. assert(base_ <= dst_);
  1082. assert(dst_ <= limit_);
  1083. size_t avail = limit_ - dst_;
  1084. if (avail == 0) {
  1085. IOStatus s = UnmapCurrentRegion();
  1086. if (!s.ok()) {
  1087. return s;
  1088. }
  1089. s = MapNewRegion();
  1090. if (!s.ok()) {
  1091. return s;
  1092. }
  1093. TEST_KILL_RANDOM("PosixMmapFile::Append:0");
  1094. }
  1095. size_t n = (left <= avail) ? left : avail;
  1096. assert(dst_);
  1097. memcpy(dst_, src, n);
  1098. dst_ += n;
  1099. src += n;
  1100. left -= n;
  1101. }
  1102. return IOStatus::OK();
  1103. }
  1104. IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
  1105. IODebugContext* /*dbg*/) {
  1106. IOStatus s;
  1107. size_t unused = limit_ - dst_;
  1108. s = UnmapCurrentRegion();
  1109. if (!s.ok()) {
  1110. s = IOError("While closing mmapped file", filename_, errno);
  1111. } else if (unused > 0) {
  1112. // Trim the extra space at the end of the file
  1113. if (ftruncate(fd_, file_offset_ - unused) < 0) {
  1114. s = IOError("While ftruncating mmaped file", filename_, errno);
  1115. }
  1116. }
  1117. if (close(fd_) < 0) {
  1118. if (s.ok()) {
  1119. s = IOError("While closing mmapped file", filename_, errno);
  1120. }
  1121. }
  1122. fd_ = -1;
  1123. base_ = nullptr;
  1124. limit_ = nullptr;
  1125. return s;
  1126. }
  1127. IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
  1128. IODebugContext* /*dbg*/) {
  1129. return IOStatus::OK();
  1130. }
  1131. IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
  1132. IODebugContext* /*dbg*/) {
  1133. #ifdef HAVE_FULLFSYNC
  1134. if (::fcntl(fd_, F_FULLFSYNC) < 0) {
  1135. return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno);
  1136. }
  1137. #else // HAVE_FULLFSYNC
  1138. if (fdatasync(fd_) < 0) {
  1139. return IOError("While fdatasync mmapped file", filename_, errno);
  1140. }
  1141. #endif // HAVE_FULLFSYNC
  1142. return Msync();
  1143. }
  1144. /**
  1145. * Flush data as well as metadata to stable storage.
  1146. */
  1147. IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
  1148. IODebugContext* /*dbg*/) {
  1149. #ifdef HAVE_FULLFSYNC
  1150. if (::fcntl(fd_, F_FULLFSYNC) < 0) {
  1151. return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno);
  1152. }
  1153. #else // HAVE_FULLFSYNC
  1154. if (fsync(fd_) < 0) {
  1155. return IOError("While fsync mmaped file", filename_, errno);
  1156. }
  1157. #endif // HAVE_FULLFSYNC
  1158. return Msync();
  1159. }
  1160. /**
  1161. * Get the size of valid data in the file. This will not match the
  1162. * size that is returned from the filesystem because we use mmap
  1163. * to extend file by map_size every time.
  1164. */
  1165. uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
  1166. IODebugContext* /*dbg*/) {
  1167. size_t used = dst_ - base_;
  1168. return file_offset_ + used;
  1169. }
  1170. IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
  1171. #ifndef OS_LINUX
  1172. (void)offset;
  1173. (void)length;
  1174. return IOStatus::OK();
  1175. #else
  1176. // free OS pages
  1177. int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  1178. if (ret == 0) {
  1179. return IOStatus::OK();
  1180. }
  1181. return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
  1182. #endif
  1183. }
  #ifdef ROCKSDB_FALLOCATE_PRESENT
  // Reserves `len` bytes starting at `offset` with fallocate(), keeping the
  // file size unchanged when fallocate_with_keep_size_ is set. Does nothing
  // and returns OK when fallocate is disallowed. Both arguments must fit in
  // off_t (asserted).
  IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                   const IOOptions& /*opts*/,
                                   IODebugContext* /*dbg*/) {
    assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
    assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
    TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");

    if (!allow_fallocate_) {
      return IOStatus::OK();
    }

    const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
    const int rc = fallocate(fd_, mode, static_cast<off_t>(offset),
                             static_cast<off_t>(len));
    if (rc != 0) {
      return IOError("While fallocate offset " + std::to_string(offset) +
                         " len " + std::to_string(len),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
  #endif
  1206. /*
  1207. * PosixWritableFile
  1208. *
  1209. * Use posix write to write data to a file.
  1210. */
  1211. PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
  1212. size_t logical_block_size,
  1213. const EnvOptions& options,
  1214. uint64_t initial_file_size)
  1215. : FSWritableFile(options),
  1216. filename_(fname),
  1217. use_direct_io_(options.use_direct_writes),
  1218. fd_(fd),
  1219. filesize_(initial_file_size),
  1220. logical_sector_size_(logical_block_size) {
  1221. #ifdef ROCKSDB_FALLOCATE_PRESENT
  1222. allow_fallocate_ = options.allow_fallocate;
  1223. fallocate_with_keep_size_ = options.fallocate_with_keep_size;
  1224. #endif
  1225. #ifdef ROCKSDB_RANGESYNC_PRESENT
  1226. sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
  1227. #endif // ROCKSDB_RANGESYNC_PRESENT
  1228. assert(!options.use_mmap_writes);
  1229. }
  1230. PosixWritableFile::~PosixWritableFile() {
  1231. if (fd_ >= 0) {
  1232. IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr);
  1233. s.PermitUncheckedError();
  1234. }
  1235. }
  1236. IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
  1237. IODebugContext* /*dbg*/) {
  1238. if (use_direct_io()) {
  1239. assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
  1240. assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  1241. }
  1242. const char* src = data.data();
  1243. size_t nbytes = data.size();
  1244. if (!PosixWrite(fd_, src, nbytes)) {
  1245. return IOError("While appending to file", filename_, errno);
  1246. }
  1247. filesize_ += nbytes;
  1248. return IOStatus::OK();
  1249. }
  1250. IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
  1251. const IOOptions& /*opts*/,
  1252. IODebugContext* /*dbg*/) {
  1253. if (use_direct_io()) {
  1254. assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  1255. assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
  1256. assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  1257. }
  1258. assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  1259. const char* src = data.data();
  1260. size_t nbytes = data.size();
  1261. if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
  1262. return IOError("While pwrite to file at offset " + std::to_string(offset),
  1263. filename_, errno);
  1264. }
  1265. filesize_ = offset + nbytes;
  1266. return IOStatus::OK();
  1267. }
  1268. IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
  1269. IODebugContext* /*dbg*/) {
  1270. IOStatus s;
  1271. int r = ftruncate(fd_, size);
  1272. if (r < 0) {
  1273. s = IOError("While ftruncate file to size " + std::to_string(size),
  1274. filename_, errno);
  1275. } else {
  1276. filesize_ = size;
  1277. }
  1278. return s;
  1279. }
  1280. IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
  1281. IODebugContext* /*dbg*/) {
  1282. IOStatus s;
  1283. size_t block_size;
  1284. size_t last_allocated_block;
  1285. GetPreallocationStatus(&block_size, &last_allocated_block);
  1286. TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block);
  1287. if (last_allocated_block > 0) {
  1288. // trim the extra space preallocated at the end of the file
  1289. // NOTE(ljin): we probably don't want to surface failure as an IOError,
  1290. // but it will be nice to log these errors.
  1291. int dummy __attribute__((__unused__));
  1292. dummy = ftruncate(fd_, filesize_);
  1293. #if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE)
  1294. // in some file systems, ftruncate only trims trailing space if the
  1295. // new file size is smaller than the current size. Calling fallocate
  1296. // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
  1297. // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
  1298. // filesystems:
  1299. // XFS (since Linux 2.6.38)
  1300. // ext4 (since Linux 3.0)
  1301. // Btrfs (since Linux 3.7)
  1302. // tmpfs (since Linux 3.5)
  1303. // We ignore error since failure of this operation does not affect
  1304. // correctness.
  1305. struct stat file_stats;
  1306. int result = fstat(fd_, &file_stats);
  1307. // After ftruncate, we check whether ftruncate has the correct behavior.
  1308. // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
  1309. if (result == 0 &&
  1310. static_cast<size_t>((file_stats.st_size + file_stats.st_blksize - 1) /
  1311. file_stats.st_blksize) !=
  1312. static_cast<size_t>(file_stats.st_blocks /
  1313. (file_stats.st_blksize / 512))) {
  1314. IOSTATS_TIMER_GUARD(allocate_nanos);
  1315. if (allow_fallocate_) {
  1316. fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
  1317. block_size * last_allocated_block - filesize_);
  1318. }
  1319. }
  1320. #endif
  1321. }
  1322. if (close(fd_) < 0) {
  1323. s = IOError("While closing file after writing", filename_, errno);
  1324. }
  1325. fd_ = -1;
  1326. return s;
  1327. }
  1328. // write out the cached data to the OS cache
  1329. IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
  1330. IODebugContext* /*dbg*/) {
  1331. return IOStatus::OK();
  1332. }
  1333. IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
  1334. IODebugContext* /*dbg*/) {
  1335. #ifdef HAVE_FULLFSYNC
  1336. if (::fcntl(fd_, F_FULLFSYNC) < 0) {
  1337. return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
  1338. }
  1339. #else // HAVE_FULLFSYNC
  1340. if (fdatasync(fd_) < 0) {
  1341. return IOError("While fdatasync", filename_, errno);
  1342. }
  1343. #endif // HAVE_FULLFSYNC
  1344. return IOStatus::OK();
  1345. }
  1346. IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
  1347. IODebugContext* /*dbg*/) {
  1348. #ifdef HAVE_FULLFSYNC
  1349. if (::fcntl(fd_, F_FULLFSYNC) < 0) {
  1350. return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
  1351. }
  1352. #else // HAVE_FULLFSYNC
  1353. if (fsync(fd_) < 0) {
  1354. return IOError("While fsync", filename_, errno);
  1355. }
  1356. #endif // HAVE_FULLFSYNC
  1357. return IOStatus::OK();
  1358. }
  1359. bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
  1360. uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
  1361. IODebugContext* /*dbg*/) {
  1362. return filesize_;
  1363. }
  1364. void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
  1365. #ifdef OS_LINUX
  1366. // Suppress Valgrind "Unimplemented functionality" error.
  1367. #ifndef ROCKSDB_VALGRIND_RUN
  1368. uint64_t fcntl_hint = hint;
  1369. if (hint == write_hint_) {
  1370. return;
  1371. }
  1372. if (fcntl(fd_, F_SET_RW_HINT, &fcntl_hint) == 0) {
  1373. write_hint_ = hint;
  1374. }
  1375. #else
  1376. (void)hint;
  1377. #endif // ROCKSDB_VALGRIND_RUN
  1378. #else
  1379. (void)hint;
  1380. #endif // OS_LINUX
  1381. }
  1382. IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
  1383. if (use_direct_io()) {
  1384. return IOStatus::OK();
  1385. }
  1386. #ifndef OS_LINUX
  1387. (void)offset;
  1388. (void)length;
  1389. return IOStatus::OK();
  1390. #else
  1391. // free OS pages
  1392. int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  1393. if (ret == 0) {
  1394. return IOStatus::OK();
  1395. }
  1396. return IOError("While fadvise NotNeeded", filename_, errno);
  1397. #endif
  1398. }
  #ifdef ROCKSDB_FALLOCATE_PRESENT
  // Reserves `len` bytes starting at `offset` with fallocate(), keeping the
  // file size unchanged when fallocate_with_keep_size_ is set. Does nothing
  // and returns OK when fallocate is disallowed. Both arguments must fit in
  // off_t (asserted). The time spent is accounted under allocate_nanos.
  IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                       const IOOptions& /*opts*/,
                                       IODebugContext* /*dbg*/) {
    assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
    assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
    TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
    IOSTATS_TIMER_GUARD(allocate_nanos);

    if (!allow_fallocate_) {
      return IOStatus::OK();
    }

    const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
    const int rc = fallocate(fd_, mode, static_cast<off_t>(offset),
                             static_cast<off_t>(len));
    if (rc != 0) {
      return IOError("While fallocate offset " + std::to_string(offset) +
                         " len " + std::to_string(len),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
  #endif
  1422. IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
  1423. const IOOptions& opts,
  1424. IODebugContext* dbg) {
  1425. #ifdef ROCKSDB_RANGESYNC_PRESENT
  1426. assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  1427. assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  1428. if (sync_file_range_supported_) {
  1429. int ret;
  1430. if (strict_bytes_per_sync_) {
  1431. // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
  1432. // that spans all bytes written so far tells `sync_file_range` to wait for
  1433. // any outstanding writeback requests to finish before issuing a new one.
  1434. ret =
  1435. sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
  1436. SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
  1437. } else {
  1438. ret = sync_file_range(fd_, static_cast<off_t>(offset),
  1439. static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
  1440. }
  1441. if (ret != 0) {
  1442. return IOError("While sync_file_range returned " + std::to_string(ret),
  1443. filename_, errno);
  1444. }
  1445. return IOStatus::OK();
  1446. }
  1447. #endif // ROCKSDB_RANGESYNC_PRESENT
  1448. return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
  1449. }
  #ifdef OS_LINUX
  // Writes an identifier for the underlying file into `id` (at most `max_size`
  // bytes) by delegating to PosixHelper::GetUniqueIdFromFile, and returns
  // whatever length that helper reports.
  size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
    const size_t id_length =
        PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
    return id_length;
  }
  #endif
  1455. /*
  1456. * PosixRandomRWFile
  1457. */
  1458. PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
  1459. const EnvOptions& /*options*/)
  1460. : filename_(fname), fd_(fd) {}
  1461. PosixRandomRWFile::~PosixRandomRWFile() {
  1462. if (fd_ >= 0) {
  1463. IOStatus s = Close(IOOptions(), nullptr);
  1464. s.PermitUncheckedError();
  1465. }
  1466. }
  1467. IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
  1468. const IOOptions& /*opts*/,
  1469. IODebugContext* /*dbg*/) {
  1470. const char* src = data.data();
  1471. size_t nbytes = data.size();
  1472. if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
  1473. return IOError("While write random read/write file at offset " +
  1474. std::to_string(offset),
  1475. filename_, errno);
  1476. }
  1477. return IOStatus::OK();
  1478. }
  1479. IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
  1480. const IOOptions& /*opts*/, Slice* result,
  1481. char* scratch, IODebugContext* /*dbg*/) const {
  1482. size_t left = n;
  1483. char* ptr = scratch;
  1484. while (left > 0) {
  1485. ssize_t done = pread(fd_, ptr, left, offset);
  1486. if (done < 0) {
  1487. // error while reading from file
  1488. if (errno == EINTR) {
  1489. // read was interrupted, try again.
  1490. continue;
  1491. }
  1492. return IOError("While reading random read/write file offset " +
  1493. std::to_string(offset) + " len " + std::to_string(n),
  1494. filename_, errno);
  1495. } else if (done == 0) {
  1496. // Nothing more to read
  1497. break;
  1498. }
  1499. // Read `done` bytes
  1500. ptr += done;
  1501. offset += done;
  1502. left -= done;
  1503. }
  1504. *result = Slice(scratch, n - left);
  1505. return IOStatus::OK();
  1506. }
  1507. IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
  1508. IODebugContext* /*dbg*/) {
  1509. return IOStatus::OK();
  1510. }
  1511. IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
  1512. IODebugContext* /*dbg*/) {
  1513. #ifdef HAVE_FULLFSYNC
  1514. if (::fcntl(fd_, F_FULLFSYNC) < 0) {
  1515. return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno);
  1516. }
  1517. #else // HAVE_FULLFSYNC
  1518. if (fdatasync(fd_) < 0) {
  1519. return IOError("While fdatasync random read/write file", filename_, errno);
  1520. }
  1521. #endif // HAVE_FULLFSYNC
  1522. return IOStatus::OK();
  1523. }
  1524. IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
  1525. IODebugContext* /*dbg*/) {
  1526. #ifdef HAVE_FULLFSYNC
  1527. if (::fcntl(fd_, F_FULLFSYNC) < 0) {
  1528. return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno);
  1529. }
  1530. #else // HAVE_FULLFSYNC
  1531. if (fsync(fd_) < 0) {
  1532. return IOError("While fsync random read/write file", filename_, errno);
  1533. }
  1534. #endif // HAVE_FULLFSYNC
  1535. return IOStatus::OK();
  1536. }
  1537. IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
  1538. IODebugContext* /*dbg*/) {
  1539. if (close(fd_) < 0) {
  1540. return IOError("While close random read/write file", filename_, errno);
  1541. }
  1542. fd_ = -1;
  1543. return IOStatus::OK();
  1544. }
  1545. PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  1546. // TODO should have error handling though not much we can do...
  1547. munmap(this->base_, length_);
  1548. }
  1549. /*
  1550. * PosixDirectory
  1551. */
  #ifndef BTRFS_SUPER_MAGIC
  // The BTRFS magic number is a fixed value; supply it ourselves when the
  // system headers did not define it.
  #define BTRFS_SUPER_MAGIC 0x9123683E
  #endif  // BTRFS_SUPER_MAGIC
  1556. PosixDirectory::PosixDirectory(int fd, const std::string& directory_name)
  1557. : fd_(fd), directory_name_(directory_name) {
  1558. is_btrfs_ = false;
  1559. #ifdef OS_LINUX
  1560. struct statfs buf;
  1561. int ret = fstatfs(fd, &buf);
  1562. is_btrfs_ = (ret == 0 && buf.f_type == static_cast<decltype(buf.f_type)>(
  1563. BTRFS_SUPER_MAGIC));
  1564. #endif
  1565. }
  1566. PosixDirectory::~PosixDirectory() {
  1567. if (fd_ >= 0) {
  1568. IOStatus s = PosixDirectory::Close(IOOptions(), nullptr);
  1569. s.PermitUncheckedError();
  1570. }
  1571. }
  1572. IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
  1573. return FsyncWithDirOptions(opts, dbg, DirFsyncOptions());
  1574. }
  1575. // Users who want the file entries synced in Directory project must call a
  1576. // Fsync or FsyncWithDirOptions function before Close
  1577. IOStatus PosixDirectory::Close(const IOOptions& /*opts*/,
  1578. IODebugContext* /*dbg*/) {
  1579. IOStatus s = IOStatus::OK();
  1580. if (close(fd_) < 0) {
  1581. s = IOError("While closing directory ", directory_name_, errno);
  1582. } else {
  1583. fd_ = -1;
  1584. }
  1585. return s;
  1586. }
  1587. IOStatus PosixDirectory::FsyncWithDirOptions(
  1588. const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
  1589. const DirFsyncOptions& dir_fsync_options) {
  1590. assert(fd_ >= 0); // Check use after close
  1591. IOStatus s = IOStatus::OK();
  1592. #ifndef OS_AIX
  1593. if (is_btrfs_) {
  1594. // skip dir fsync for new file creation, which is not needed for btrfs
  1595. if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
  1596. return s;
  1597. }
  1598. // skip dir fsync for renaming file, only need to sync new file
  1599. if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
  1600. std::string new_name = dir_fsync_options.renamed_new_name;
  1601. assert(!new_name.empty());
  1602. int fd;
  1603. do {
  1604. IOSTATS_TIMER_GUARD(open_nanos);
  1605. fd = open(new_name.c_str(), O_RDONLY);
  1606. } while (fd < 0 && errno == EINTR);
  1607. if (fd < 0) {
  1608. s = IOError("While open renaming file", new_name, errno);
  1609. } else if (fsync(fd) < 0) {
  1610. s = IOError("While fsync renaming file", new_name, errno);
  1611. }
  1612. if (close(fd) < 0) {
  1613. s = IOError("While closing file after fsync", new_name, errno);
  1614. }
  1615. return s;
  1616. }
  1617. // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
  1618. }
  1619. // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed
  1620. // in either the de-construction or the close function, data must have been
  1621. // fsync-ed before de-construction and close is called
  1622. #ifdef HAVE_FULLFSYNC
  1623. // btrfs is a Linux file system, while currently F_FULLFSYNC is available on
  1624. // Mac OS.
  1625. assert(!is_btrfs_);
  1626. if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) {
  1627. return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
  1628. }
  1629. #else // HAVE_FULLFSYNC
  1630. if (fd_ != -1 && fsync(fd_) == -1) {
  1631. s = IOError("While fsync", "a directory", errno);
  1632. }
  1633. #endif // HAVE_FULLFSYNC
  1634. #endif // OS_AIX
  1635. return s;
  1636. }
  1637. } // namespace ROCKSDB_NAMESPACE
  1638. #endif