io_win.cc 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #include "port/win/io_win.h"
  10. #include "monitoring/iostats_context_imp.h"
  11. #include "test_util/sync_point.h"
  12. #include "util/aligned_buffer.h"
  13. #include "util/coding.h"
  14. namespace ROCKSDB_NAMESPACE {
  15. namespace port {
  16. /*
  17. * DirectIOHelper
  18. */
  namespace {

  // Sector size, in bytes, that IsSectorAligned() checks against.
  const size_t kSectorSize = 512;

  // Returns true iff `alignment` is a power of two.
  // Zero is rejected explicitly: the bit trick alone would accept it because
  // (0 & (0 - 1)) == 0.
  inline bool IsPowerOfTwo(const size_t alignment) {
    return alignment != 0 && (alignment & (alignment - 1)) == 0;
  }

  // Returns true iff `off` is a multiple of kSectorSize.
  inline bool IsSectorAligned(const size_t off) {
    return (off & (kSectorSize - 1)) == 0;
  }

  // Returns true iff the address held by `ptr` has none of the bits of
  // (alignment - 1) set. The mask test is only a multiple-of check when
  // `alignment` is a power of two.
  inline bool IsAligned(size_t alignment, const void* ptr) {
    return (reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) == 0;
  }

  }  // namespace
  34. std::string GetWindowsErrSz(DWORD err) {
  35. LPSTR lpMsgBuf;
  36. FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
  37. FORMAT_MESSAGE_IGNORE_INSERTS,
  38. NULL, err,
  39. 0, // Default language
  40. reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
  41. std::string Err = lpMsgBuf;
  42. LocalFree(lpMsgBuf);
  43. return Err;
  44. }
  45. // We preserve the original name of this interface to denote the original idea
  46. // behind it.
  47. // All reads happen by a specified offset and pwrite interface does not change
  48. // the position of the file pointer. Judging from the man page and errno it does
  49. // execute
  50. // lseek atomically to return the position of the file back where it was.
  51. // WriteFile() does not
  52. // have this capability. Therefore, for both pread and pwrite the pointer is
  53. // advanced to the next position
  54. // which is fine for writes because they are (should be) sequential.
  55. // Because all the reads/writes happen by the specified offset, the caller in
  56. // theory should not
  57. // rely on the current file offset.
  58. Status pwrite(const WinFileData* file_data, const Slice& data,
  59. uint64_t offset, size_t& bytes_written) {
  60. Status s;
  61. bytes_written = 0;
  62. size_t num_bytes = data.size();
  63. if (num_bytes > std::numeric_limits<DWORD>::max()) {
  64. // May happen in 64-bit builds where size_t is 64-bits but
  65. // long is still 32-bit, but that's the API here at the moment
  66. return Status::InvalidArgument("num_bytes is too large for a single write: " +
  67. file_data->GetName());
  68. }
  69. OVERLAPPED overlapped = { 0 };
  70. ULARGE_INTEGER offsetUnion;
  71. offsetUnion.QuadPart = offset;
  72. overlapped.Offset = offsetUnion.LowPart;
  73. overlapped.OffsetHigh = offsetUnion.HighPart;
  74. DWORD bytesWritten = 0;
  75. if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast<DWORD>(num_bytes),
  76. &bytesWritten, &overlapped)) {
  77. auto lastError = GetLastError();
  78. s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
  79. lastError);
  80. } else {
  81. bytes_written = bytesWritten;
  82. }
  83. return s;
  84. }
  85. // See comments for pwrite above
  86. Status pread(const WinFileData* file_data, char* src, size_t num_bytes,
  87. uint64_t offset, size_t& bytes_read) {
  88. Status s;
  89. bytes_read = 0;
  90. if (num_bytes > std::numeric_limits<DWORD>::max()) {
  91. return Status::InvalidArgument("num_bytes is too large for a single read: " +
  92. file_data->GetName());
  93. }
  94. OVERLAPPED overlapped = { 0 };
  95. ULARGE_INTEGER offsetUnion;
  96. offsetUnion.QuadPart = offset;
  97. overlapped.Offset = offsetUnion.LowPart;
  98. overlapped.OffsetHigh = offsetUnion.HighPart;
  99. DWORD bytesRead = 0;
  100. if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast<DWORD>(num_bytes),
  101. &bytesRead, &overlapped)) {
  102. auto lastError = GetLastError();
  103. // EOF is OK with zero bytes read
  104. if (lastError != ERROR_HANDLE_EOF) {
  105. s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
  106. lastError);
  107. }
  108. } else {
  109. bytes_read = bytesRead;
  110. }
  111. return s;
  112. }
  113. // SetFileInformationByHandle() is capable of fast pre-allocates.
  114. // However, this does not change the file end position unless the file is
  115. // truncated and the pre-allocated space is not considered filled with zeros.
  116. Status fallocate(const std::string& filename, HANDLE hFile,
  117. uint64_t to_size) {
  118. Status status;
  119. FILE_ALLOCATION_INFO alloc_info;
  120. alloc_info.AllocationSize.QuadPart = to_size;
  121. if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
  122. sizeof(FILE_ALLOCATION_INFO))) {
  123. auto lastError = GetLastError();
  124. status = IOErrorFromWindowsError(
  125. "Failed to pre-allocate space: " + filename, lastError);
  126. }
  127. return status;
  128. }
  129. Status ftruncate(const std::string& filename, HANDLE hFile,
  130. uint64_t toSize) {
  131. Status status;
  132. FILE_END_OF_FILE_INFO end_of_file;
  133. end_of_file.EndOfFile.QuadPart = toSize;
  134. if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
  135. sizeof(FILE_END_OF_FILE_INFO))) {
  136. auto lastError = GetLastError();
  137. status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
  138. lastError);
  139. }
  140. return status;
  141. }
  142. size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
  143. size_t /*max_size*/) {
  144. // Returning 0 is safe as it causes the table reader to generate a unique ID.
  145. // This is suboptimal for performance as it prevents multiple table readers
  146. // for the same file from sharing cached blocks. For example, if users have
  147. // a low value for `max_open_files`, there can be many table readers opened
  148. // for the same file.
  149. //
  150. // TODO: this is a temporarily solution as it is safe but not optimal for
  151. // performance. For more details see discussion in
  152. // https://github.com/facebook/rocksdb/pull/5844.
  153. return 0;
  154. }
  155. ////////////////////////////////////////////////////////////////////////////////////////////////////
  156. // WinMmapReadableFile
  157. WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
  158. HANDLE hFile, HANDLE hMap,
  159. const void* mapped_region,
  160. size_t length)
  161. : WinFileData(fileName, hFile, false /* use_direct_io */),
  162. hMap_(hMap),
  163. mapped_region_(mapped_region),
  164. length_(length) {}
  165. WinMmapReadableFile::~WinMmapReadableFile() {
  166. BOOL ret __attribute__((__unused__));
  167. ret = ::UnmapViewOfFile(mapped_region_);
  168. assert(ret);
  169. ret = ::CloseHandle(hMap_);
  170. assert(ret);
  171. }
  172. Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
  173. char* scratch) const {
  174. Status s;
  175. if (offset > length_) {
  176. *result = Slice();
  177. return IOError(filename_, EINVAL);
  178. } else if (offset + n > length_) {
  179. n = length_ - static_cast<size_t>(offset);
  180. }
  181. *result =
  182. Slice(reinterpret_cast<const char*>(mapped_region_)+offset, n);
  183. return s;
  184. }
  185. Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
  186. return Status::OK();
  187. }
  188. size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
  189. return GetUniqueIdFromFile(hFile_, id, max_size);
  190. }
  191. ///////////////////////////////////////////////////////////////////////////////
  192. /// WinMmapFile
  193. // Can only truncate or reserve to a sector size aligned if
  194. // used on files that are opened with Unbuffered I/O
  195. Status WinMmapFile::TruncateFile(uint64_t toSize) {
  196. return ftruncate(filename_, hFile_, toSize);
  197. }
  198. Status WinMmapFile::UnmapCurrentRegion() {
  199. Status status;
  200. if (mapped_begin_ != nullptr) {
  201. if (!::UnmapViewOfFile(mapped_begin_)) {
  202. status = IOErrorFromWindowsError(
  203. "Failed to unmap file view: " + filename_, GetLastError());
  204. }
  205. // Move on to the next portion of the file
  206. file_offset_ += view_size_;
  207. // UnmapView automatically sends data to disk but not the metadata
  208. // which is good and provides some equivalent of fdatasync() on Linux
  209. // therefore, we donot need separate flag for metadata
  210. mapped_begin_ = nullptr;
  211. mapped_end_ = nullptr;
  212. dst_ = nullptr;
  213. last_sync_ = nullptr;
  214. pending_sync_ = false;
  215. }
  216. return status;
  217. }
  218. Status WinMmapFile::MapNewRegion() {
  219. Status status;
  220. assert(mapped_begin_ == nullptr);
  221. size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
  222. if (minDiskSize > reserved_size_) {
  223. status = Allocate(file_offset_, view_size_);
  224. if (!status.ok()) {
  225. return status;
  226. }
  227. }
  228. // Need to remap
  229. if (hMap_ == NULL || reserved_size_ > mapping_size_) {
  230. if (hMap_ != NULL) {
  231. // Unmap the previous one
  232. BOOL ret __attribute__((__unused__));
  233. ret = ::CloseHandle(hMap_);
  234. assert(ret);
  235. hMap_ = NULL;
  236. }
  237. ULARGE_INTEGER mappingSize;
  238. mappingSize.QuadPart = reserved_size_;
  239. hMap_ = CreateFileMappingA(
  240. hFile_,
  241. NULL, // Security attributes
  242. PAGE_READWRITE, // There is not a write only mode for mapping
  243. mappingSize.HighPart, // Enable mapping the whole file but the actual
  244. // amount mapped is determined by MapViewOfFile
  245. mappingSize.LowPart,
  246. NULL); // Mapping name
  247. if (NULL == hMap_) {
  248. return IOErrorFromWindowsError(
  249. "WindowsMmapFile failed to create file mapping for: " + filename_,
  250. GetLastError());
  251. }
  252. mapping_size_ = reserved_size_;
  253. }
  254. ULARGE_INTEGER offset;
  255. offset.QuadPart = file_offset_;
  256. // View must begin at the granularity aligned offset
  257. mapped_begin_ = reinterpret_cast<char*>(
  258. MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
  259. view_size_, NULL));
  260. if (!mapped_begin_) {
  261. status = IOErrorFromWindowsError(
  262. "WindowsMmapFile failed to map file view: " + filename_,
  263. GetLastError());
  264. } else {
  265. mapped_end_ = mapped_begin_ + view_size_;
  266. dst_ = mapped_begin_;
  267. last_sync_ = mapped_begin_;
  268. pending_sync_ = false;
  269. }
  270. return status;
  271. }
  272. Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
  273. return fallocate(filename_, hFile_, spaceToReserve);
  274. }
  275. WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
  276. size_t page_size, size_t allocation_granularity,
  277. const EnvOptions& options)
  278. : WinFileData(fname, hFile, false),
  279. WritableFile(options),
  280. hMap_(NULL),
  281. page_size_(page_size),
  282. allocation_granularity_(allocation_granularity),
  283. reserved_size_(0),
  284. mapping_size_(0),
  285. view_size_(0),
  286. mapped_begin_(nullptr),
  287. mapped_end_(nullptr),
  288. dst_(nullptr),
  289. last_sync_(nullptr),
  290. file_offset_(0),
  291. pending_sync_(false) {
  292. // Allocation granularity must be obtained from GetSystemInfo() and must be
  293. // a power of two.
  294. assert(allocation_granularity > 0);
  295. assert((allocation_granularity & (allocation_granularity - 1)) == 0);
  296. assert(page_size > 0);
  297. assert((page_size & (page_size - 1)) == 0);
  298. // Only for memory mapped writes
  299. assert(options.use_mmap_writes);
  300. // View size must be both the multiple of allocation_granularity AND the
  301. // page size and the granularity is usually a multiple of a page size.
  302. const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
  303. view_size_ = Roundup(viewSize, allocation_granularity_);
  304. }
  305. WinMmapFile::~WinMmapFile() {
  306. if (hFile_) {
  307. this->Close();
  308. }
  309. }
  310. Status WinMmapFile::Append(const Slice& data) {
  311. const char* src = data.data();
  312. size_t left = data.size();
  313. while (left > 0) {
  314. assert(mapped_begin_ <= dst_);
  315. size_t avail = mapped_end_ - dst_;
  316. if (avail == 0) {
  317. Status s = UnmapCurrentRegion();
  318. if (s.ok()) {
  319. s = MapNewRegion();
  320. }
  321. if (!s.ok()) {
  322. return s;
  323. }
  324. } else {
  325. size_t n = std::min(left, avail);
  326. memcpy(dst_, src, n);
  327. dst_ += n;
  328. src += n;
  329. left -= n;
  330. pending_sync_ = true;
  331. }
  332. }
  333. // Now make sure that the last partial page is padded with zeros if needed
  334. size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
  335. if (bytesToPad > 0) {
  336. memset(dst_, 0, bytesToPad);
  337. }
  338. return Status::OK();
  339. }
  340. // Means Close() will properly take care of truncate
  341. // and it does not need any additional information
  342. Status WinMmapFile::Truncate(uint64_t size) {
  343. return Status::OK();
  344. }
  345. Status WinMmapFile::Close() {
  346. Status s;
  347. assert(NULL != hFile_);
  348. // We truncate to the precise size so no
  349. // uninitialized data at the end. SetEndOfFile
  350. // which we use does not write zeros and it is good.
  351. uint64_t targetSize = GetFileSize();
  352. if (mapped_begin_ != nullptr) {
  353. // Sync before unmapping to make sure everything
  354. // is on disk and there is not a lazy writing
  355. // so we are deterministic with the tests
  356. Sync();
  357. s = UnmapCurrentRegion();
  358. }
  359. if (NULL != hMap_) {
  360. BOOL ret = ::CloseHandle(hMap_);
  361. if (!ret && s.ok()) {
  362. auto lastError = GetLastError();
  363. s = IOErrorFromWindowsError(
  364. "Failed to Close mapping for file: " + filename_, lastError);
  365. }
  366. hMap_ = NULL;
  367. }
  368. if (hFile_ != NULL) {
  369. TruncateFile(targetSize);
  370. BOOL ret = ::CloseHandle(hFile_);
  371. hFile_ = NULL;
  372. if (!ret && s.ok()) {
  373. auto lastError = GetLastError();
  374. s = IOErrorFromWindowsError(
  375. "Failed to close file map handle: " + filename_, lastError);
  376. }
  377. }
  378. return s;
  379. }
  380. Status WinMmapFile::Flush() { return Status::OK(); }
  381. // Flush only data
  382. Status WinMmapFile::Sync() {
  383. Status s;
  384. // Some writes occurred since last sync
  385. if (dst_ > last_sync_) {
  386. assert(mapped_begin_);
  387. assert(dst_);
  388. assert(dst_ > mapped_begin_);
  389. assert(dst_ < mapped_end_);
  390. size_t page_begin =
  391. TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
  392. size_t page_end =
  393. TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
  394. // Flush only the amount of that is a multiple of pages
  395. if (!::FlushViewOfFile(mapped_begin_ + page_begin,
  396. (page_end - page_begin) + page_size_)) {
  397. s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
  398. GetLastError());
  399. } else {
  400. last_sync_ = dst_;
  401. }
  402. }
  403. return s;
  404. }
  405. /**
  406. * Flush data as well as metadata to stable storage.
  407. */
  408. Status WinMmapFile::Fsync() {
  409. Status s = Sync();
  410. // Flush metadata
  411. if (s.ok() && pending_sync_) {
  412. if (!::FlushFileBuffers(hFile_)) {
  413. s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
  414. GetLastError());
  415. }
  416. pending_sync_ = false;
  417. }
  418. return s;
  419. }
  420. /**
  421. * Get the size of valid data in the file. This will not match the
  422. * size that is returned from the filesystem because we use mmap
  423. * to extend file by map_size every time.
  424. */
  425. uint64_t WinMmapFile::GetFileSize() {
  426. size_t used = dst_ - mapped_begin_;
  427. return file_offset_ + used;
  428. }
  429. Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
  430. return Status::OK();
  431. }
  432. Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
  433. Status status;
  434. TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
  435. // Make sure that we reserve an aligned amount of space
  436. // since the reservation block size is driven outside so we want
  437. // to check if we are ok with reservation here
  438. size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), view_size_);
  439. // Nothing to do
  440. if (spaceToReserve <= reserved_size_) {
  441. return status;
  442. }
  443. IOSTATS_TIMER_GUARD(allocate_nanos);
  444. status = PreallocateInternal(spaceToReserve);
  445. if (status.ok()) {
  446. reserved_size_ = spaceToReserve;
  447. }
  448. return status;
  449. }
  450. size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
  451. return GetUniqueIdFromFile(hFile_, id, max_size);
  452. }
  453. //////////////////////////////////////////////////////////////////////////////////
  454. // WinSequentialFile
  455. WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
  456. const EnvOptions& options)
  457. : WinFileData(fname, f, options.use_direct_reads) {}
  458. WinSequentialFile::~WinSequentialFile() {
  459. assert(hFile_ != INVALID_HANDLE_VALUE);
  460. }
  461. Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) {
  462. Status s;
  463. size_t r = 0;
  464. assert(result != nullptr);
  465. if (WinFileData::use_direct_io()) {
  466. return Status::NotSupported("Read() does not support direct_io");
  467. }
  468. // Windows ReadFile API accepts a DWORD.
  469. // While it is possible to read in a loop if n is too big
  470. // it is an unlikely case.
  471. if (n > std::numeric_limits<DWORD>::max()) {
  472. return Status::InvalidArgument("n is too big for a single ReadFile: "
  473. + filename_);
  474. }
  475. DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
  476. DWORD bytesRead = 0;
  477. BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
  478. if (ret != FALSE) {
  479. r = bytesRead;
  480. } else {
  481. auto lastError = GetLastError();
  482. if (lastError != ERROR_HANDLE_EOF) {
  483. s = IOErrorFromWindowsError("ReadFile failed: " + filename_,
  484. lastError);
  485. }
  486. }
  487. *result = Slice(scratch, r);
  488. return s;
  489. }
  490. Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
  491. uint64_t offset, size_t& bytes_read) const {
  492. return pread(this, src, numBytes, offset, bytes_read);
  493. }
  494. Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result,
  495. char* scratch) {
  496. Status s;
  497. if (!WinFileData::use_direct_io()) {
  498. return Status::NotSupported("This function is only used for direct_io");
  499. }
  500. if (!IsSectorAligned(static_cast<size_t>(offset)) ||
  501. !IsSectorAligned(n)) {
  502. return Status::InvalidArgument(
  503. "WinSequentialFile::PositionedRead: offset is not properly aligned");
  504. }
  505. size_t bytes_read = 0; // out param
  506. s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset, bytes_read);
  507. *result = Slice(scratch, bytes_read);
  508. return s;
  509. }
  510. Status WinSequentialFile::Skip(uint64_t n) {
  511. // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit
  512. // integer. As such it is a highly unlikley case to have n so large.
  513. if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
  514. return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" +
  515. filename_);
  516. }
  517. LARGE_INTEGER li;
  518. li.QuadPart = static_cast<LONGLONG>(n); //cast is safe due to the check above
  519. BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
  520. if (ret == FALSE) {
  521. auto lastError = GetLastError();
  522. return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
  523. lastError);
  524. }
  525. return Status::OK();
  526. }
  527. Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
  528. return Status::OK();
  529. }
  530. //////////////////////////////////////////////////////////////////////////////////////////////////
  531. /// WinRandomAccessBase
  532. inline
  533. Status WinRandomAccessImpl::PositionedReadInternal(char* src,
  534. size_t numBytes,
  535. uint64_t offset,
  536. size_t& bytes_read) const {
  537. return pread(file_base_, src, numBytes, offset, bytes_read);
  538. }
  539. inline
  540. WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
  541. size_t alignment,
  542. const EnvOptions& options) :
  543. file_base_(file_base),
  544. alignment_(alignment) {
  545. assert(!options.use_mmap_reads);
  546. }
  547. inline
  548. Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
  549. char* scratch) const {
  550. Status s;
  551. // Check buffer alignment
  552. if (file_base_->use_direct_io()) {
  553. if (!IsSectorAligned(static_cast<size_t>(offset)) ||
  554. !IsAligned(alignment_, scratch)) {
  555. return Status::InvalidArgument(
  556. "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned");
  557. }
  558. }
  559. if (n == 0) {
  560. *result = Slice(scratch, 0);
  561. return s;
  562. }
  563. size_t bytes_read = 0;
  564. s = PositionedReadInternal(scratch, n, offset, bytes_read);
  565. *result = Slice(scratch, bytes_read);
  566. return s;
  567. }
  568. ///////////////////////////////////////////////////////////////////////////////////////////////////
  569. /// WinRandomAccessFile
  570. WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
  571. size_t alignment,
  572. const EnvOptions& options)
  573. : WinFileData(fname, hFile, options.use_direct_reads),
  574. WinRandomAccessImpl(this, alignment, options) {}
  575. WinRandomAccessFile::~WinRandomAccessFile() {
  576. }
  577. Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
  578. char* scratch) const {
  579. return ReadImpl(offset, n, result, scratch);
  580. }
  581. Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
  582. return Status::OK();
  583. }
  584. size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  585. return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
  586. }
  587. size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
  588. return GetAlignment();
  589. }
  590. /////////////////////////////////////////////////////////////////////////////
  591. // WinWritableImpl
  592. //
  593. inline
  594. Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
  595. return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve);
  596. }
  597. inline
  598. WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
  599. : file_data_(file_data),
  600. alignment_(alignment),
  601. next_write_offset_(0),
  602. reservedsize_(0) {
  603. // Query current position in case ReopenWritableFile is called
  604. // This position is only important for buffered writes
  605. // for unbuffered writes we explicitely specify the position.
  606. LARGE_INTEGER zero_move;
  607. zero_move.QuadPart = 0; // Do not move
  608. LARGE_INTEGER pos;
  609. pos.QuadPart = 0;
  610. BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
  611. FILE_CURRENT);
  612. // Querying no supped to fail
  613. if (ret != 0) {
  614. next_write_offset_ = pos.QuadPart;
  615. } else {
  616. assert(false);
  617. }
  618. }
  619. inline
  620. Status WinWritableImpl::AppendImpl(const Slice& data) {
  621. Status s;
  622. if (data.size() > std::numeric_limits<DWORD>::max()) {
  623. return Status::InvalidArgument("data is too long for a single write" +
  624. file_data_->GetName());
  625. }
  626. size_t bytes_written = 0; // out param
  627. if (file_data_->use_direct_io()) {
  628. // With no offset specified we are appending
  629. // to the end of the file
  630. assert(IsSectorAligned(next_write_offset_));
  631. if (!IsSectorAligned(data.size()) ||
  632. !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
  633. s = Status::InvalidArgument(
  634. "WriteData must be page aligned, size must be sector aligned");
  635. } else {
  636. s = pwrite(file_data_, data, next_write_offset_, bytes_written);
  637. }
  638. } else {
  639. DWORD bytesWritten = 0;
  640. if (!WriteFile(file_data_->GetFileHandle(), data.data(),
  641. static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
  642. auto lastError = GetLastError();
  643. s = IOErrorFromWindowsError(
  644. "Failed to WriteFile: " + file_data_->GetName(),
  645. lastError);
  646. } else {
  647. bytes_written = bytesWritten;
  648. }
  649. }
  650. if(s.ok()) {
  651. if (bytes_written == data.size()) {
  652. // This matters for direct_io cases where
  653. // we rely on the fact that next_write_offset_
  654. // is sector aligned
  655. next_write_offset_ += bytes_written;
  656. } else {
  657. s = Status::IOError("Failed to write all bytes: " +
  658. file_data_->GetName());
  659. }
  660. }
  661. return s;
  662. }
  663. inline
  664. Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) {
  665. if(file_data_->use_direct_io()) {
  666. if (!IsSectorAligned(static_cast<size_t>(offset)) ||
  667. !IsSectorAligned(data.size()) ||
  668. !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
  669. return Status::InvalidArgument(
  670. "Data and offset must be page aligned, size must be sector aligned");
  671. }
  672. }
  673. size_t bytes_written = 0;
  674. Status s = pwrite(file_data_, data, offset, bytes_written);
  675. if(s.ok()) {
  676. if (bytes_written == data.size()) {
  677. // For sequential write this would be simple
  678. // size extension by data.size()
  679. uint64_t write_end = offset + bytes_written;
  680. if (write_end >= next_write_offset_) {
  681. next_write_offset_ = write_end;
  682. }
  683. } else {
  684. s = Status::IOError("Failed to write all of the requested data: " +
  685. file_data_->GetName());
  686. }
  687. }
  688. return s;
  689. }
  690. inline
  691. Status WinWritableImpl::TruncateImpl(uint64_t size) {
  692. // It is tempting to check for the size for sector alignment
  693. // but truncation may come at the end and there is not a requirement
  694. // for this to be sector aligned so long as we do not attempt to write
  695. // after that. The interface docs state that the behavior is undefined
  696. // in that case.
  697. Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(),
  698. size);
  699. if (s.ok()) {
  700. next_write_offset_ = size;
  701. }
  702. return s;
  703. }
  704. inline
  705. Status WinWritableImpl::CloseImpl() {
  706. Status s;
  707. auto hFile = file_data_->GetFileHandle();
  708. assert(INVALID_HANDLE_VALUE != hFile);
  709. if (!::FlushFileBuffers(hFile)) {
  710. auto lastError = GetLastError();
  711. s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " +
  712. file_data_->GetName(),
  713. lastError);
  714. }
  715. if(!file_data_->CloseFile() && s.ok()) {
  716. auto lastError = GetLastError();
  717. s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(),
  718. lastError);
  719. }
  720. return s;
  721. }
  722. inline
  723. Status WinWritableImpl::SyncImpl() {
  724. Status s;
  725. if (!::FlushFileBuffers (file_data_->GetFileHandle())) {
  726. auto lastError = GetLastError();
  727. s = IOErrorFromWindowsError(
  728. "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError);
  729. }
  730. return s;
  731. }
  732. inline
  733. Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
  734. Status status;
  735. TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
  736. // Make sure that we reserve an aligned amount of space
  737. // since the reservation block size is driven outside so we want
  738. // to check if we are ok with reservation here
  739. size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), static_cast<size_t>(alignment_));
  740. // Nothing to do
  741. if (spaceToReserve <= reservedsize_) {
  742. return status;
  743. }
  744. IOSTATS_TIMER_GUARD(allocate_nanos);
  745. status = PreallocateInternal(spaceToReserve);
  746. if (status.ok()) {
  747. reservedsize_ = spaceToReserve;
  748. }
  749. return status;
  750. }
  751. ////////////////////////////////////////////////////////////////////////////////
  752. /// WinWritableFile
  753. WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
  754. size_t alignment, size_t /* capacity */,
  755. const EnvOptions& options)
  756. : WinFileData(fname, hFile, options.use_direct_writes),
  757. WinWritableImpl(this, alignment),
  758. WritableFile(options) {
  759. assert(!options.use_mmap_writes);
  760. }
  761. WinWritableFile::~WinWritableFile() {
  762. }
  763. // Indicates if the class makes use of direct I/O
  764. bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); }
  765. size_t WinWritableFile::GetRequiredBufferAlignment() const {
  766. return static_cast<size_t>(GetAlignement());
  767. }
  768. Status WinWritableFile::Append(const Slice& data) {
  769. return AppendImpl(data);
  770. }
  771. Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
  772. return PositionedAppendImpl(data, offset);
  773. }
  774. // Need to implement this so the file is truncated correctly
  775. // when buffered and unbuffered mode
  776. Status WinWritableFile::Truncate(uint64_t size) {
  777. return TruncateImpl(size);
  778. }
  779. Status WinWritableFile::Close() {
  780. return CloseImpl();
  781. }
  782. // write out the cached data to the OS cache
  783. // This is now taken care of the WritableFileWriter
  784. Status WinWritableFile::Flush() {
  785. return Status::OK();
  786. }
  787. Status WinWritableFile::Sync() {
  788. return SyncImpl();
  789. }
  790. Status WinWritableFile::Fsync() { return SyncImpl(); }
  791. bool WinWritableFile::IsSyncThreadSafe() const { return true; }
  792. uint64_t WinWritableFile::GetFileSize() {
  793. return GetFileNextWriteOffset();
  794. }
  795. Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) {
  796. return AllocateImpl(offset, len);
  797. }
  798. size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
  799. return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
  800. }
  801. /////////////////////////////////////////////////////////////////////////
  802. /// WinRandomRWFile
  803. WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
  804. size_t alignment, const EnvOptions& options)
  805. : WinFileData(fname, hFile,
  806. options.use_direct_reads && options.use_direct_writes),
  807. WinRandomAccessImpl(this, alignment, options),
  808. WinWritableImpl(this, alignment) {}
  809. bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); }
  810. size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
  811. return static_cast<size_t>(GetAlignement());
  812. }
  813. Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
  814. return PositionedAppendImpl(data, offset);
  815. }
  816. Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
  817. char* scratch) const {
  818. return ReadImpl(offset, n, result, scratch);
  819. }
  820. Status WinRandomRWFile::Flush() {
  821. return Status::OK();
  822. }
  823. Status WinRandomRWFile::Sync() {
  824. return SyncImpl();
  825. }
  826. Status WinRandomRWFile::Close() {
  827. return CloseImpl();
  828. }
  829. //////////////////////////////////////////////////////////////////////////
  /// WinMemoryMappedBuffer
  831. WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
  832. BOOL ret
  833. #if defined(_MSC_VER)
  834. = FALSE;
  835. #else
  836. __attribute__((__unused__));
  837. #endif
  838. if (base_ != nullptr) {
  839. ret = ::UnmapViewOfFile(base_);
  840. assert(ret);
  841. base_ = nullptr;
  842. }
  843. if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
  844. ret = ::CloseHandle(map_handle_);
  845. assert(ret);
  846. map_handle_ = NULL;
  847. }
  848. if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
  849. ret = ::CloseHandle(file_handle_);
  850. assert(ret);
  851. file_handle_ = NULL;
  852. }
  853. }
  854. //////////////////////////////////////////////////////////////////////////
  855. /// WinDirectory
  856. Status WinDirectory::Fsync() { return Status::OK(); }
  857. size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
  858. return GetUniqueIdFromFile(handle_, id, max_size);
  859. }
  860. //////////////////////////////////////////////////////////////////////////
  861. /// WinFileLock
  862. WinFileLock::~WinFileLock() {
  863. BOOL ret __attribute__((__unused__));
  864. ret = ::CloseHandle(hFile_);
  865. assert(ret);
  866. }
  867. }
  868. } // namespace ROCKSDB_NAMESPACE