io_win.cc 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #if defined(OS_WIN)
  10. #include "port/win/io_win.h"
  11. #include "env_win.h"
  12. #include "monitoring/iostats_context_imp.h"
  13. #include "test_util/sync_point.h"
  14. #include "util/aligned_buffer.h"
  15. #include "util/coding.h"
  16. namespace ROCKSDB_NAMESPACE {
  17. namespace port {
  18. /*
  19. * DirectIOHelper
  20. */
  namespace {
  // Sector size constant, in bytes.
  const size_t kSectorSize = 512;

  // Returns true iff `alignment` is a power of two. Zero is not a power of two
  // and is rejected explicitly (the bit trick alone would accept it).
  inline bool IsPowerOfTwo(const size_t alignment) {
    return alignment != 0 && (alignment & (alignment - 1)) == 0;
  }

  // Returns true iff the address `ptr` is a multiple of `alignment`.
  // The mask test is only meaningful when `alignment` is a power of two.
  inline bool IsAligned(size_t alignment, const void* ptr) {
    return (reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) == 0;
  }
  }  // namespace
  30. std::string GetWindowsErrSz(DWORD err) {
  31. std::string Err;
  32. LPSTR lpMsgBuf = nullptr;
  33. FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
  34. FORMAT_MESSAGE_IGNORE_INSERTS,
  35. NULL, err,
  36. 0, // Default language
  37. reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
  38. if (lpMsgBuf) {
  39. Err = lpMsgBuf;
  40. LocalFree(lpMsgBuf);
  41. }
  42. return Err;
  43. }
  // We preserve the original name of this interface to denote the original idea
  // behind it.
  // All reads happen at a specified offset, and the pwrite interface does not
  // change the position of the file pointer. Judging from the man page and
  // errno, it executes lseek atomically to return the file position to where it
  // was. WriteFile() does not have this capability. Therefore, for both pread
  // and pwrite the pointer is advanced to the next position, which is fine for
  // writes because they are (should be) sequential.
  // Because all the reads/writes happen at the specified offset, the caller in
  // theory should not rely on the current file offset.
  57. IOStatus pwrite(const WinFileData* file_data, const Slice& data,
  58. uint64_t offset, size_t& bytes_written) {
  59. IOStatus s;
  60. bytes_written = 0;
  61. size_t num_bytes = data.size();
  62. if (num_bytes > std::numeric_limits<DWORD>::max()) {
  63. // May happen in 64-bit builds where size_t is 64-bits but
  64. // long is still 32-bit, but that's the API here at the moment
  65. return IOStatus::InvalidArgument(
  66. "num_bytes is too large for a single write: " + file_data->GetName());
  67. }
  68. OVERLAPPED overlapped = {0};
  69. ULARGE_INTEGER offsetUnion;
  70. offsetUnion.QuadPart = offset;
  71. overlapped.Offset = offsetUnion.LowPart;
  72. overlapped.OffsetHigh = offsetUnion.HighPart;
  73. DWORD bytesWritten = 0;
  74. if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
  75. static_cast<DWORD>(num_bytes), &bytesWritten,
  76. &overlapped)) {
  77. auto lastError = GetLastError();
  78. s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
  79. lastError);
  80. } else {
  81. bytes_written = bytesWritten;
  82. }
  83. return s;
  84. }
  85. // See comments for pwrite above
  86. IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
  87. uint64_t offset, size_t& bytes_read) {
  88. IOStatus s;
  89. bytes_read = 0;
  90. if (num_bytes > std::numeric_limits<DWORD>::max()) {
  91. return IOStatus::InvalidArgument(
  92. "num_bytes is too large for a single read: " + file_data->GetName());
  93. }
  94. OVERLAPPED overlapped = {0};
  95. ULARGE_INTEGER offsetUnion;
  96. offsetUnion.QuadPart = offset;
  97. overlapped.Offset = offsetUnion.LowPart;
  98. overlapped.OffsetHigh = offsetUnion.HighPart;
  99. DWORD bytesRead = 0;
  100. if (FALSE == ReadFile(file_data->GetFileHandle(), src,
  101. static_cast<DWORD>(num_bytes), &bytesRead,
  102. &overlapped)) {
  103. auto lastError = GetLastError();
  104. // EOF is OK with zero bytes read
  105. if (lastError != ERROR_HANDLE_EOF) {
  106. s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
  107. lastError);
  108. }
  109. } else {
  110. bytes_read = bytesRead;
  111. }
  112. return s;
  113. }
  114. // SetFileInformationByHandle() is capable of fast pre-allocates.
  115. // However, this does not change the file end position unless the file is
  116. // truncated and the pre-allocated space is not considered filled with zeros.
  117. IOStatus fallocate(const std::string& filename, HANDLE hFile,
  118. uint64_t to_size) {
  119. IOStatus status;
  120. FILE_ALLOCATION_INFO alloc_info;
  121. alloc_info.AllocationSize.QuadPart = to_size;
  122. if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
  123. sizeof(FILE_ALLOCATION_INFO))) {
  124. auto lastError = GetLastError();
  125. status = IOErrorFromWindowsError(
  126. "Failed to pre-allocate space: " + filename, lastError);
  127. }
  128. return status;
  129. }
  130. IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
  131. IOStatus status;
  132. FILE_END_OF_FILE_INFO end_of_file;
  133. end_of_file.EndOfFile.QuadPart = toSize;
  134. if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
  135. sizeof(FILE_END_OF_FILE_INFO))) {
  136. auto lastError = GetLastError();
  137. status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
  138. lastError);
  139. }
  140. return status;
  141. }
  142. size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
  143. size_t /*max_size*/) {
  144. // Returning 0 is safe as it causes the table reader to generate a unique ID.
  145. // This is suboptimal for performance as it prevents multiple table readers
  146. // for the same file from sharing cached blocks. For example, if users have
  147. // a low value for `max_open_files`, there can be many table readers opened
  148. // for the same file.
  149. //
  150. // TODO: this is a temporarily solution as it is safe but not optimal for
  151. // performance. For more details see discussion in
  152. // https://github.com/facebook/rocksdb/pull/5844.
  153. return 0;
  154. }
  155. WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
  156. bool direct_io)
  157. : filename_(filename),
  158. hFile_(hFile),
  159. use_direct_io_(direct_io),
  160. sector_size_(WinFileSystem::GetSectorSize(filename)) {}
  161. bool WinFileData::IsSectorAligned(const size_t off) const {
  162. return (off & (sector_size_ - 1)) == 0;
  163. }
  164. ////////////////////////////////////////////////////////////////////////////////////////////////////
  165. // WinMmapReadableFile
  166. WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
  167. HANDLE hFile, HANDLE hMap,
  168. const void* mapped_region,
  169. size_t length)
  170. : WinFileData(fileName, hFile, false /* use_direct_io */),
  171. hMap_(hMap),
  172. mapped_region_(mapped_region),
  173. length_(length) {}
  174. WinMmapReadableFile::~WinMmapReadableFile() {
  175. BOOL ret __attribute__((__unused__));
  176. ret = ::UnmapViewOfFile(mapped_region_);
  177. assert(ret);
  178. ret = ::CloseHandle(hMap_);
  179. assert(ret);
  180. }
  181. IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
  182. const IOOptions& /*options*/, Slice* result,
  183. char* scratch,
  184. IODebugContext* /*dbg*/) const {
  185. IOStatus s;
  186. if (offset > length_) {
  187. *result = Slice();
  188. return IOError(filename_, EINVAL);
  189. } else if (offset + n > length_) {
  190. n = length_ - static_cast<size_t>(offset);
  191. }
  192. *result = Slice(static_cast<const char*>(mapped_region_) + offset, n);
  193. return s;
  194. }
  195. IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
  196. return IOStatus::OK();
  197. }
  198. size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
  199. return GetUniqueIdFromFile(hFile_, id, max_size);
  200. }
  201. IOStatus WinMmapReadableFile::GetFileSize(uint64_t* size) {
  202. LARGE_INTEGER fileSize;
  203. if (GetFileSizeEx(hFile_, &fileSize)) {
  204. *size = fileSize.QuadPart;
  205. return IOStatus::OK();
  206. } else {
  207. return IOStatus::IOError("Failed to get file size", filename_);
  208. }
  209. }
  210. ///////////////////////////////////////////////////////////////////////////////
  211. /// WinMmapFile
  212. // Can only truncate or reserve to a sector size aligned if
  213. // used on files that are opened with Unbuffered I/O
  214. IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
  215. return ftruncate(filename_, hFile_, toSize);
  216. }
  217. IOStatus WinMmapFile::UnmapCurrentRegion() {
  218. IOStatus status;
  219. if (mapped_begin_ != nullptr) {
  220. if (!::UnmapViewOfFile(mapped_begin_)) {
  221. status = IOErrorFromWindowsError(
  222. "Failed to unmap file view: " + filename_, GetLastError());
  223. }
  224. // Move on to the next portion of the file
  225. file_offset_ += view_size_;
  226. // UnmapView automatically sends data to disk but not the metadata
  227. // which is good and provides some equivalent of fdatasync() on Linux
  228. // therefore, we donot need separate flag for metadata
  229. mapped_begin_ = nullptr;
  230. mapped_end_ = nullptr;
  231. dst_ = nullptr;
  232. last_sync_ = nullptr;
  233. pending_sync_ = false;
  234. }
  235. return status;
  236. }
  237. IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
  238. IODebugContext* dbg) {
  239. IOStatus status;
  240. assert(mapped_begin_ == nullptr);
  241. size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
  242. if (minDiskSize > reserved_size_) {
  243. status = Allocate(file_offset_, view_size_, options, dbg);
  244. if (!status.ok()) {
  245. return status;
  246. }
  247. }
  248. // Need to remap
  249. if (hMap_ == NULL || reserved_size_ > mapping_size_) {
  250. if (hMap_ != NULL) {
  251. // Unmap the previous one
  252. BOOL ret __attribute__((__unused__));
  253. ret = ::CloseHandle(hMap_);
  254. assert(ret);
  255. hMap_ = NULL;
  256. }
  257. ULARGE_INTEGER mappingSize;
  258. mappingSize.QuadPart = reserved_size_;
  259. hMap_ = CreateFileMappingA(
  260. hFile_,
  261. NULL, // Security attributes
  262. PAGE_READWRITE, // There is not a write only mode for mapping
  263. mappingSize.HighPart, // Enable mapping the whole file but the actual
  264. // amount mapped is determined by MapViewOfFile
  265. mappingSize.LowPart,
  266. NULL); // Mapping name
  267. if (NULL == hMap_) {
  268. return IOErrorFromWindowsError(
  269. "WindowsMmapFile failed to create file mapping for: " + filename_,
  270. GetLastError());
  271. }
  272. mapping_size_ = reserved_size_;
  273. }
  274. ULARGE_INTEGER offset;
  275. offset.QuadPart = file_offset_;
  276. // View must begin at the granularity aligned offset
  277. mapped_begin_ =
  278. static_cast<char*>(MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart,
  279. offset.LowPart, view_size_, NULL));
  280. if (!mapped_begin_) {
  281. status = IOErrorFromWindowsError(
  282. "WindowsMmapFile failed to map file view: " + filename_,
  283. GetLastError());
  284. } else {
  285. mapped_end_ = mapped_begin_ + view_size_;
  286. dst_ = mapped_begin_;
  287. last_sync_ = mapped_begin_;
  288. pending_sync_ = false;
  289. }
  290. return status;
  291. }
  292. IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
  293. return fallocate(filename_, hFile_, spaceToReserve);
  294. }
  295. WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
  296. size_t page_size, size_t allocation_granularity,
  297. const FileOptions& options)
  298. : WinFileData(fname, hFile, false),
  299. FSWritableFile(options),
  300. hMap_(NULL),
  301. page_size_(page_size),
  302. allocation_granularity_(allocation_granularity),
  303. reserved_size_(0),
  304. mapping_size_(0),
  305. view_size_(0),
  306. mapped_begin_(nullptr),
  307. mapped_end_(nullptr),
  308. dst_(nullptr),
  309. last_sync_(nullptr),
  310. file_offset_(0),
  311. pending_sync_(false) {
  312. // Allocation granularity must be obtained from GetSystemInfo() and must be
  313. // a power of two.
  314. assert(allocation_granularity > 0);
  315. assert((allocation_granularity & (allocation_granularity - 1)) == 0);
  316. assert(page_size > 0);
  317. assert((page_size & (page_size - 1)) == 0);
  318. // Only for memory mapped writes
  319. assert(options.use_mmap_writes);
  320. // View size must be both the multiple of allocation_granularity AND the
  321. // page size and the granularity is usually a multiple of a page size.
  322. const size_t viewSize =
  323. 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
  324. view_size_ = Roundup(viewSize, allocation_granularity_);
  325. }
  326. WinMmapFile::~WinMmapFile() {
  327. if (hFile_) {
  328. this->Close(IOOptions(), nullptr);
  329. }
  330. }
  331. IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
  332. IODebugContext* dbg) {
  333. const char* src = data.data();
  334. size_t left = data.size();
  335. while (left > 0) {
  336. assert(mapped_begin_ <= dst_);
  337. size_t avail = mapped_end_ - dst_;
  338. if (avail == 0) {
  339. IOStatus s = UnmapCurrentRegion();
  340. if (s.ok()) {
  341. s = MapNewRegion(options, dbg);
  342. }
  343. if (!s.ok()) {
  344. return s;
  345. }
  346. } else {
  347. size_t n = std::min(left, avail);
  348. memcpy(dst_, src, n);
  349. dst_ += n;
  350. src += n;
  351. left -= n;
  352. pending_sync_ = true;
  353. }
  354. }
  355. // Now make sure that the last partial page is padded with zeros if needed
  356. size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
  357. if (bytesToPad > 0) {
  358. memset(dst_, 0, bytesToPad);
  359. }
  360. return IOStatus::OK();
  361. }
  362. // Means Close() will properly take care of truncate
  363. // and it does not need any additional information
  364. IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
  365. IODebugContext* /*dbg*/) {
  366. return IOStatus::OK();
  367. }
  368. IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
  369. IOStatus s;
  370. assert(NULL != hFile_);
  371. // We truncate to the precise size so no
  372. // uninitialized data at the end. SetEndOfFile
  373. // which we use does not write zeros and it is good.
  374. uint64_t targetSize = GetFileSize(options, dbg);
  375. if (mapped_begin_ != nullptr) {
  376. // Sync before unmapping to make sure everything
  377. // is on disk and there is not a lazy writing
  378. // so we are deterministic with the tests
  379. Sync(options, dbg);
  380. s = UnmapCurrentRegion();
  381. }
  382. if (NULL != hMap_) {
  383. BOOL ret = ::CloseHandle(hMap_);
  384. if (!ret && s.ok()) {
  385. auto lastError = GetLastError();
  386. s = IOErrorFromWindowsError(
  387. "Failed to Close mapping for file: " + filename_, lastError);
  388. }
  389. hMap_ = NULL;
  390. }
  391. if (hFile_ != NULL) {
  392. TruncateFile(targetSize);
  393. BOOL ret = ::CloseHandle(hFile_);
  394. hFile_ = NULL;
  395. if (!ret && s.ok()) {
  396. auto lastError = GetLastError();
  397. s = IOErrorFromWindowsError(
  398. "Failed to close file map handle: " + filename_, lastError);
  399. }
  400. }
  401. return s;
  402. }
  403. IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
  404. IODebugContext* /*dbg*/) {
  405. return IOStatus::OK();
  406. }
  407. // Flush only data
  408. IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
  409. IODebugContext* /*dbg*/) {
  410. IOStatus s;
  411. // Some writes occurred since last sync
  412. if (dst_ > last_sync_) {
  413. assert(mapped_begin_);
  414. assert(dst_);
  415. assert(dst_ > mapped_begin_);
  416. assert(dst_ < mapped_end_);
  417. size_t page_begin =
  418. TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
  419. size_t page_end =
  420. TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
  421. // Flush only the amount of that is a multiple of pages
  422. if (!::FlushViewOfFile(mapped_begin_ + page_begin,
  423. (page_end - page_begin) + page_size_)) {
  424. s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
  425. GetLastError());
  426. } else {
  427. last_sync_ = dst_;
  428. }
  429. }
  430. return s;
  431. }
  432. /**
  433. * Flush data as well as metadata to stable storage.
  434. */
  435. IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
  436. IOStatus s = Sync(options, dbg);
  437. // Flush metadata
  438. if (s.ok() && pending_sync_) {
  439. if (!::FlushFileBuffers(hFile_)) {
  440. s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
  441. GetLastError());
  442. }
  443. pending_sync_ = false;
  444. }
  445. return s;
  446. }
  447. /**
  448. * Get the size of valid data in the file. This will not match the
  449. * size that is returned from the filesystem because we use mmap
  450. * to extend file by map_size every time.
  451. */
  452. uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
  453. IODebugContext* /*dbg*/) {
  454. size_t used = dst_ - mapped_begin_;
  455. return file_offset_ + used;
  456. }
  457. IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
  458. return IOStatus::OK();
  459. }
  460. IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
  461. const IOOptions& /*options*/,
  462. IODebugContext* /*dbg*/) {
  463. IOStatus status;
  464. TEST_KILL_RANDOM("WinMmapFile::Allocate");
  465. // Make sure that we reserve an aligned amount of space
  466. // since the reservation block size is driven outside so we want
  467. // to check if we are ok with reservation here
  468. size_t spaceToReserve =
  469. Roundup(static_cast<size_t>(offset + len), view_size_);
  470. // Nothing to do
  471. if (spaceToReserve <= reserved_size_) {
  472. return status;
  473. }
  474. IOSTATS_TIMER_GUARD(allocate_nanos);
  475. status = PreallocateInternal(spaceToReserve);
  476. if (status.ok()) {
  477. reserved_size_ = spaceToReserve;
  478. }
  479. return status;
  480. }
  481. size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
  482. return GetUniqueIdFromFile(hFile_, id, max_size);
  483. }
  484. //////////////////////////////////////////////////////////////////////////////////
  485. // WinSequentialFile
  486. WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
  487. const FileOptions& options)
  488. : WinFileData(fname, f, options.use_direct_reads) {}
  489. WinSequentialFile::~WinSequentialFile() {
  490. assert(hFile_ != INVALID_HANDLE_VALUE);
  491. }
  492. IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
  493. Slice* result, char* scratch,
  494. IODebugContext* /*dbg*/) {
  495. IOStatus s;
  496. size_t r = 0;
  497. assert(result != nullptr);
  498. if (WinFileData::use_direct_io()) {
  499. return IOStatus::NotSupported("Read() does not support direct_io");
  500. }
  501. // Windows ReadFile API accepts a DWORD.
  502. // While it is possible to read in a loop if n is too big
  503. // it is an unlikely case.
  504. if (n > std::numeric_limits<DWORD>::max()) {
  505. return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
  506. filename_);
  507. }
  508. DWORD bytesToRead =
  509. static_cast<DWORD>(n); // cast is safe due to the check above
  510. DWORD bytesRead = 0;
  511. BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
  512. if (ret != FALSE) {
  513. r = bytesRead;
  514. } else {
  515. auto lastError = GetLastError();
  516. if (lastError != ERROR_HANDLE_EOF) {
  517. s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
  518. }
  519. }
  520. *result = Slice(scratch, r);
  521. return s;
  522. }
  523. IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
  524. uint64_t offset,
  525. size_t& bytes_read) const {
  526. return pread(this, src, numBytes, offset, bytes_read);
  527. }
  528. IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
  529. const IOOptions& /*opts*/,
  530. Slice* result, char* scratch,
  531. IODebugContext* /*dbg*/) {
  532. if (!WinFileData::use_direct_io()) {
  533. return IOStatus::NotSupported("This function is only used for direct_io");
  534. }
  535. assert(IsSectorAligned(static_cast<size_t>(offset)));
  536. assert(IsSectorAligned(static_cast<size_t>(n)));
  537. size_t bytes_read = 0; // out param
  538. IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
  539. bytes_read);
  540. *result = Slice(scratch, bytes_read);
  541. return s;
  542. }
  543. IOStatus WinSequentialFile::Skip(uint64_t n) {
  544. // Can't handle more than signed max as SetFilePointerEx accepts a signed
  545. // 64-bit integer. As such it is a highly unlikley case to have n so large.
  546. if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
  547. return IOStatus::InvalidArgument(
  548. "n is too large for a single SetFilePointerEx() call" + filename_);
  549. }
  550. LARGE_INTEGER li;
  551. li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check
  552. // above
  553. BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
  554. if (ret == FALSE) {
  555. auto lastError = GetLastError();
  556. return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
  557. lastError);
  558. }
  559. return IOStatus::OK();
  560. }
  561. IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
  562. return IOStatus::OK();
  563. }
  564. //////////////////////////////////////////////////////////////////////////////////////////////////
  565. /// WinRandomAccessBase
  566. inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
  567. char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
  568. return pread(file_base_, src, numBytes, offset, bytes_read);
  569. }
  570. inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
  571. size_t alignment,
  572. const FileOptions& options)
  573. : file_base_(file_base),
  574. alignment_(std::max(alignment, file_base->GetSectorSize())) {
  575. assert(!options.use_mmap_reads);
  576. }
  577. inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
  578. Slice* result,
  579. char* scratch) const {
  580. // Check buffer alignment
  581. if (file_base_->use_direct_io()) {
  582. assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
  583. assert(IsAligned(alignment_, scratch));
  584. }
  585. if (n == 0) {
  586. *result = Slice(scratch, 0);
  587. return IOStatus::OK();
  588. }
  589. size_t bytes_read = 0;
  590. IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
  591. *result = Slice(scratch, bytes_read);
  592. return s;
  593. }
  594. ///////////////////////////////////////////////////////////////////////////////////////////////////
  595. /// WinRandomAccessFile
  596. WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
  597. size_t alignment,
  598. const FileOptions& options)
  599. : WinFileData(fname, hFile, options.use_direct_reads),
  600. WinRandomAccessImpl(this, alignment, options) {}
  601. WinRandomAccessFile::~WinRandomAccessFile() {}
  602. IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
  603. const IOOptions& /*options*/, Slice* result,
  604. char* scratch,
  605. IODebugContext* /*dbg*/) const {
  606. return ReadImpl(offset, n, result, scratch);
  607. }
  608. IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
  609. return IOStatus::OK();
  610. }
  611. size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  612. return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
  613. }
  614. size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
  615. return GetAlignment();
  616. }
  617. IOStatus WinRandomAccessFile::GetFileSize(uint64_t* size) {
  618. LARGE_INTEGER fileSize;
  619. if (GetFileSizeEx(hFile_, &fileSize)) {
  620. *size = fileSize.QuadPart;
  621. return IOStatus::OK();
  622. } else {
  623. return IOStatus::IOError("Failed to get file size", filename_);
  624. }
  625. }
  626. /////////////////////////////////////////////////////////////////////////////
  627. // WinWritableImpl
  628. //
  629. inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
  630. return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
  631. spaceToReserve);
  632. }
  633. inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
  634. size_t alignment)
  635. : file_data_(file_data),
  636. alignment_(std::max(alignment, file_data->GetSectorSize())),
  637. next_write_offset_(0),
  638. reservedsize_(0) {
  639. // Query current position in case ReopenWritableFile is called
  640. // This position is only important for buffered writes
  641. // for unbuffered writes we explicitely specify the position.
  642. LARGE_INTEGER zero_move;
  643. zero_move.QuadPart = 0; // Do not move
  644. LARGE_INTEGER pos;
  645. pos.QuadPart = 0;
  646. BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
  647. FILE_CURRENT);
  648. // Querying no supped to fail
  649. if (ret != 0) {
  650. next_write_offset_ = pos.QuadPart;
  651. } else {
  652. assert(false);
  653. }
  654. }
  655. inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
  656. IOStatus s;
  657. if (data.size() > std::numeric_limits<DWORD>::max()) {
  658. return IOStatus::InvalidArgument("data is too long for a single write" +
  659. file_data_->GetName());
  660. }
  661. size_t bytes_written = 0; // out param
  662. if (file_data_->use_direct_io()) {
  663. // With no offset specified we are appending
  664. // to the end of the file
  665. assert(file_data_->IsSectorAligned(next_write_offset_));
  666. assert(file_data_->IsSectorAligned(data.size()));
  667. assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
  668. s = pwrite(file_data_, data, next_write_offset_, bytes_written);
  669. } else {
  670. DWORD bytesWritten = 0;
  671. if (!WriteFile(file_data_->GetFileHandle(), data.data(),
  672. static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
  673. auto lastError = GetLastError();
  674. s = IOErrorFromWindowsError(
  675. "Failed to WriteFile: " + file_data_->GetName(), lastError);
  676. } else {
  677. bytes_written = bytesWritten;
  678. }
  679. }
  680. if (s.ok()) {
  681. if (bytes_written == data.size()) {
  682. // This matters for direct_io cases where
  683. // we rely on the fact that next_write_offset_
  684. // is sector aligned
  685. next_write_offset_ += bytes_written;
  686. } else {
  687. s = IOStatus::IOError("Failed to write all bytes: " +
  688. file_data_->GetName());
  689. }
  690. }
  691. return s;
  692. }
  693. inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
  694. uint64_t offset) {
  695. if (file_data_->use_direct_io()) {
  696. assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
  697. assert(file_data_->IsSectorAligned(data.size()));
  698. assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
  699. }
  700. size_t bytes_written = 0;
  701. IOStatus s = pwrite(file_data_, data, offset, bytes_written);
  702. if (s.ok()) {
  703. if (bytes_written == data.size()) {
  704. // For sequential write this would be simple
  705. // size extension by data.size()
  706. uint64_t write_end = offset + bytes_written;
  707. if (write_end >= next_write_offset_) {
  708. next_write_offset_ = write_end;
  709. }
  710. } else {
  711. s = IOStatus::IOError("Failed to write all of the requested data: " +
  712. file_data_->GetName());
  713. }
  714. }
  715. return s;
  716. }
  717. inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
  718. // It is tempting to check for the size for sector alignment
  719. // but truncation may come at the end and there is not a requirement
  720. // for this to be sector aligned so long as we do not attempt to write
  721. // after that. The interface docs state that the behavior is undefined
  722. // in that case.
  723. IOStatus s =
  724. ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
  725. if (s.ok()) {
  726. next_write_offset_ = size;
  727. }
  728. return s;
  729. }
  730. inline IOStatus WinWritableImpl::CloseImpl() {
  731. IOStatus s;
  732. auto hFile = file_data_->GetFileHandle();
  733. assert(INVALID_HANDLE_VALUE != hFile);
  734. if (!::FlushFileBuffers(hFile)) {
  735. auto lastError = GetLastError();
  736. s = IOErrorFromWindowsError(
  737. "FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
  738. lastError);
  739. }
  740. if (!file_data_->CloseFile() && s.ok()) {
  741. auto lastError = GetLastError();
  742. s = IOErrorFromWindowsError(
  743. "CloseHandle failed for: " + file_data_->GetName(), lastError);
  744. }
  745. return s;
  746. }
  747. inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
  748. IODebugContext* /*dbg*/) {
  749. IOStatus s;
  750. if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
  751. auto lastError = GetLastError();
  752. s = IOErrorFromWindowsError(
  753. "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
  754. lastError);
  755. }
  756. return s;
  757. }
  758. inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
  759. IOStatus status;
  760. TEST_KILL_RANDOM("WinWritableFile::Allocate");
  761. // Make sure that we reserve an aligned amount of space
  762. // since the reservation block size is driven outside so we want
  763. // to check if we are ok with reservation here
  764. size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
  765. static_cast<size_t>(alignment_));
  766. // Nothing to do
  767. if (spaceToReserve <= reservedsize_) {
  768. return status;
  769. }
  770. IOSTATS_TIMER_GUARD(allocate_nanos);
  771. status = PreallocateInternal(spaceToReserve);
  772. if (status.ok()) {
  773. reservedsize_ = spaceToReserve;
  774. }
  775. return status;
  776. }
  777. ////////////////////////////////////////////////////////////////////////////////
  778. /// WinWritableFile
  779. WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
  780. size_t alignment, size_t /* capacity */,
  781. const FileOptions& options)
  782. : WinFileData(fname, hFile, options.use_direct_writes),
  783. WinWritableImpl(this, alignment),
  784. FSWritableFile(options) {
  785. assert(!options.use_mmap_writes);
  786. }
  787. WinWritableFile::~WinWritableFile() {}
  788. // Indicates if the class makes use of direct I/O
  789. bool WinWritableFile::use_direct_io() const {
  790. return WinFileData::use_direct_io();
  791. }
  792. size_t WinWritableFile::GetRequiredBufferAlignment() const {
  793. return static_cast<size_t>(GetAlignment());
  794. }
  795. IOStatus WinWritableFile::Append(const Slice& data,
  796. const IOOptions& /*options*/,
  797. IODebugContext* /*dbg*/) {
  798. return AppendImpl(data);
  799. }
  800. IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
  801. const IOOptions& /*options*/,
  802. IODebugContext* /*dbg*/) {
  803. return PositionedAppendImpl(data, offset);
  804. }
  805. // Need to implement this so the file is truncated correctly
  806. // when buffered and unbuffered mode
  807. IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
  808. IODebugContext* /*dbg*/) {
  809. return TruncateImpl(size);
  810. }
  811. IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
  812. IODebugContext* /*dbg*/) {
  813. return CloseImpl();
  814. }
  815. // write out the cached data to the OS cache
  816. // This is now taken care of the WritableFileWriter
  817. IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
  818. IODebugContext* /*dbg*/) {
  819. return IOStatus::OK();
  820. }
  821. IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
  822. return SyncImpl(options, dbg);
  823. }
  824. IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
  825. return SyncImpl(options, dbg);
  826. }
  827. bool WinWritableFile::IsSyncThreadSafe() const { return true; }
  828. uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
  829. IODebugContext* /*dbg*/) {
  830. return GetFileNextWriteOffset();
  831. }
  832. IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
  833. const IOOptions& /*options*/,
  834. IODebugContext* /*dbg*/) {
  835. return AllocateImpl(offset, len);
  836. }
  837. size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
  838. return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
  839. }
  840. /////////////////////////////////////////////////////////////////////////
  841. /// WinRandomRWFile
  842. WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
  843. size_t alignment, const FileOptions& options)
  844. : WinFileData(fname, hFile,
  845. options.use_direct_reads && options.use_direct_writes),
  846. WinRandomAccessImpl(this, alignment, options),
  847. WinWritableImpl(this, alignment) {}
  848. bool WinRandomRWFile::use_direct_io() const {
  849. return WinFileData::use_direct_io();
  850. }
  851. size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
  852. assert(WinRandomAccessImpl::GetAlignment() ==
  853. WinWritableImpl::GetAlignment());
  854. return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
  855. }
  856. IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
  857. const IOOptions& /*options*/,
  858. IODebugContext* /*dbg*/) {
  859. return PositionedAppendImpl(data, offset);
  860. }
  861. IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
  862. const IOOptions& /*options*/, Slice* result,
  863. char* scratch, IODebugContext* /*dbg*/) const {
  864. return ReadImpl(offset, n, result, scratch);
  865. }
  866. IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
  867. IODebugContext* /*dbg*/) {
  868. return IOStatus::OK();
  869. }
  870. IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
  871. return SyncImpl(options, dbg);
  872. }
  873. IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
  874. IODebugContext* /*dbg*/) {
  875. return CloseImpl();
  876. }
  877. //////////////////////////////////////////////////////////////////////////
  /// WinMemoryMappedBuffer
  879. WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
  880. BOOL ret
  881. #if defined(_MSC_VER)
  882. = FALSE;
  883. #else
  884. __attribute__((__unused__));
  885. #endif
  886. if (base_ != nullptr) {
  887. ret = ::UnmapViewOfFile(base_);
  888. assert(ret);
  889. base_ = nullptr;
  890. }
  891. if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
  892. ret = ::CloseHandle(map_handle_);
  893. assert(ret);
  894. map_handle_ = NULL;
  895. }
  896. if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
  897. ret = ::CloseHandle(file_handle_);
  898. assert(ret);
  899. file_handle_ = NULL;
  900. }
  901. }
  902. //////////////////////////////////////////////////////////////////////////
  903. /// WinDirectory
  904. IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
  905. IODebugContext* /*dbg*/) {
  906. return IOStatus::OK();
  907. }
  908. IOStatus WinDirectory::Close(const IOOptions& /*options*/,
  909. IODebugContext* /*dbg*/) {
  910. IOStatus s = IOStatus::OK();
  911. BOOL ret __attribute__((__unused__));
  912. if (handle_ != INVALID_HANDLE_VALUE) {
  913. ret = ::CloseHandle(handle_);
  914. if (!ret) {
  915. auto lastError = GetLastError();
  916. s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
  917. lastError);
  918. }
  919. handle_ = NULL;
  920. }
  921. return s;
  922. }
  923. size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
  924. return GetUniqueIdFromFile(handle_, id, max_size);
  925. }
  926. //////////////////////////////////////////////////////////////////////////
  927. /// WinFileLock
  928. WinFileLock::~WinFileLock() {
  929. BOOL ret __attribute__((__unused__));
  930. ret = ::CloseHandle(hFile_);
  931. assert(ret);
  932. }
  933. } // namespace port
  934. } // namespace ROCKSDB_NAMESPACE
  935. #endif