memory.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  import collections
  import contextlib
  import warnings
  from typing import Any, Dict, Tuple, Union

  import torch
  from torch.types import Device

  from . import is_initialized, _get_device_index, _lazy_init
  # Names exported by ``from torch.cuda.memory import *``.
  __all__ = [
      # raw allocator access and configuration
      "caching_allocator_alloc",
      "caching_allocator_delete",
      "set_per_process_memory_fraction",
      "empty_cache",
      # allocator statistics
      "memory_stats",
      "memory_stats_as_nested_dict",
      "reset_accumulated_memory_stats",
      "reset_peak_memory_stats",
      "reset_max_memory_allocated",
      "reset_max_memory_cached",
      "memory_allocated",
      "max_memory_allocated",
      "memory_reserved",
      "max_memory_reserved",
      "memory_cached",
      "max_memory_cached",
      # reporting helpers
      "memory_snapshot",
      "memory_summary",
      "list_gpu_processes",
      "mem_get_info",
  ]
  def _host_allocator():
      """Return the object produced by ``torch._C._cuda_cudaHostAllocator()``.

      CUDA is lazily initialized first so the binding can be called safely.
      """
      _lazy_init()
      allocator = torch._C._cuda_cudaHostAllocator()
      return allocator
  @contextlib.contextmanager
  def _free_mutex():
      """Context manager that holds the CUDA mutex for the duration of the block.

      The mutex is locked on entry and always unlocked on exit, including when
      the body raises.
      """
      torch._C._cuda_lock_mutex()
      with contextlib.ExitStack() as cleanup:
          # Registered only after the lock succeeded, so we never unlock a
          # mutex we do not hold.
          cleanup.callback(torch._C._cuda_unlock_mutex)
          yield
  def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
      r"""Allocates raw memory through the CUDA caching allocator.

      The allocation is made for a specific device and stream. This function
      exists for interoperability with other frameworks; memory obtained here
      must be released with :func:`~torch.cuda.caching_allocator_delete`.

      Args:
          size (int): number of bytes to be allocated.
          device (torch.device or int, optional): selected device. If it is
              ``None`` the default CUDA device is used.
          stream (torch.cuda.Stream or int, optional): selected stream. If it is
              ``None`` then the default stream for the selected device is used.

      Raises:
          TypeError: if ``stream`` is neither a ``torch.cuda.Stream`` nor an ``int``.

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      requested = torch.cuda.current_device() if device is None else device
      device = _get_device_index(requested)

      if stream is None:
          stream = torch.cuda.current_stream(device)

      # Stream objects are unwrapped to their raw handle; anything else is
      # passed through and validated below.
      if isinstance(stream, torch.cuda.streams.Stream):
          raw_stream = stream.cuda_stream
      else:
          raw_stream = stream

      if not isinstance(raw_stream, int):
          raise TypeError('Invalid type for stream argument, must be '
                          '`torch.cuda.Stream` or `int` representing a pointer '
                          'to a exisiting stream')

      # The raw allocation is performed with the target device made current.
      with torch.cuda.device(device):
          return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, raw_stream)
  def caching_allocator_delete(mem_ptr):
      r"""Frees memory that was allocated through the CUDA caching allocator.

      This is the counterpart of :func:`~torch.cuda.caching_allocator_alloc`.
      The associated device and stream are tracked inside the allocator, so
      only the address is needed.

      Args:
          mem_ptr (int): memory address to be freed by the allocator.

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      raw_delete = torch._C._cuda_cudaCachingAllocator_raw_delete
      raw_delete(mem_ptr)
  def set_per_process_memory_fraction(fraction, device: Union[Device, int] = None) -> None:
      r"""Sets the memory fraction available to this process on a device.

      The fraction caps how much memory the caching allocator may allocate on a
      CUDA device: the allowed amount equals the total visible memory multiplied
      by ``fraction``. Trying to allocate more than that in this process raises
      an out-of-memory error in the allocator.

      Args:
          fraction (float): Range: 0~1. Allowed memory equals total_memory * fraction.
          device (torch.device or int, optional): selected device. If it is
              ``None`` the default CUDA device is used.

      Raises:
          TypeError: if ``fraction`` is not a ``float``.
          ValueError: if ``fraction`` is below 0 or above 1.

      .. note::
          In general, the total available free memory is less than the total capacity.
      """
      _lazy_init()

      chosen = torch.cuda.current_device() if device is None else device
      device = _get_device_index(chosen)

      if not isinstance(fraction, float):
          raise TypeError('Invalid type for fraction argument, must be `float`')

      out_of_range = fraction > 1 or fraction < 0
      if out_of_range:
          message = 'Invalid fraction value: {}. Allowed range: 0~1'.format(fraction)
          raise ValueError(message)

      torch._C._cuda_setMemoryFraction(fraction, device)
  def empty_cache() -> None:
      r"""Releases all unoccupied cached memory currently held by the caching
      allocator, so that it can be used by other GPU applications and becomes
      visible in `nvidia-smi`.

      Does nothing when CUDA has not been initialized yet.

      .. note::
          :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
          memory available for PyTorch. However, it may help reduce fragmentation
          of GPU memory in certain cases. See :ref:`cuda-memory-management` for
          more details about GPU memory management.
      """
      if not is_initialized():
          return
      torch._C._cuda_emptyCache()
  def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
      r"""Returns a flat, key-sorted dictionary of CUDA memory allocator
      statistics for a given device.

      The nested result of :func:`~torch.cuda.memory_stats_as_nested_dict` is
      flattened by joining the nested keys with ``"."``. Each statistic is a
      non-negative integer.

      Core statistics:

      - ``"allocation.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        number of allocation requests received by the memory allocator.
      - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        amount of allocated memory.
      - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        number of reserved segments from ``cudaMalloc()``.
      - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        amount of reserved memory.
      - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        number of active memory blocks.
      - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        amount of active memory.
      - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        number of inactive, non-releasable memory blocks.
      - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
        amount of inactive, non-releasable memory.

      For these core statistics, values are broken down as follows.

      Pool type:

      - ``all``: combined statistics across all memory pools.
      - ``large_pool``: statistics for the large allocation pool
        (as of October 2019, for size >= 1MB allocations).
      - ``small_pool``: statistics for the small allocation pool
        (as of October 2019, for size < 1MB allocations).

      Metric type:

      - ``current``: current value of this metric.
      - ``peak``: maximum value of this metric.
      - ``allocated``: historical total increase in this metric.
      - ``freed``: historical total decrease in this metric.

      In addition to the core statistics, some simple event counters are
      provided:

      - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that
        result in a cache flush and retry.
      - ``"num_ooms"``: number of out-of-memory errors thrown.

      The caching allocator can be configured via ENV to not split blocks larger
      than a defined size (see Memory Management section of the Cuda Semantics
      documentation). This helps avoid memory fragmentation but may have a
      performance penalty. Additional outputs to assist with tuning and
      evaluating impact:

      - ``"max_split_size"``: blocks above this size will not be split.
      - ``"oversize_allocations.{current,peak,allocated,freed}"``:
        number of over-size allocation requests received by the memory allocator.
      - ``"oversize_segments.{current,peak,allocated,freed}"``:
        number of over-size reserved segments from ``cudaMalloc()``.

      Args:
          device (torch.device or int, optional): selected device. Returns
              statistics for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      flat = []
      # Explicit work list instead of recursion: each entry is a dotted key
      # path together with the (sub)tree or leaf value found at that path.
      pending = [("", memory_stats_as_nested_dict(device=device))]
      while pending:
          path, node = pending.pop()
          if not isinstance(node, dict):
              flat.append((path, node))
              continue
          stem = path + "." if path else path
          pending.extend((stem + key, child) for key, child in node.items())

      # Sorting makes the final ordering independent of traversal order.
      flat.sort()
      return collections.OrderedDict(flat)
  def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]:
      r"""Returns the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.

      An empty dictionary is returned when CUDA has not been initialized.
      """
      if is_initialized():
          index = _get_device_index(device, optional=True)
          return torch._C._cuda_memoryStats(index)
      return {}
  def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
      r"""Resets the "accumulated" (historical) stats tracked by the CUDA memory allocator.

      See :func:`~torch.cuda.memory_stats` for details. Accumulated stats are the
      `"allocated"` and `"freed"` keys in each individual stat dict, as well as
      `"num_alloc_retries"` and `"num_ooms"`.

      Args:
          device (torch.device or int, optional): selected device. Resets the
              statistics of the current device, given by
              :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
              (default).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      index = _get_device_index(device, optional=True)
      return torch._C._cuda_resetAccumulatedMemoryStats(index)
  def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
      r"""Resets the "peak" stats tracked by the CUDA memory allocator.

      See :func:`~torch.cuda.memory_stats` for details. Peak stats are the
      `"peak"` key in each individual stat dict.

      Args:
          device (torch.device or int, optional): selected device. Resets the
              statistics of the current device, given by
              :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
              (default).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      index = _get_device_index(device, optional=True)
      return torch._C._cuda_resetPeakMemoryStats(index)
  def reset_max_memory_allocated(device: Union[Device, int] = None) -> None:
      r"""Resets the starting point in tracking maximum GPU memory occupied by
      tensors for a given device.

      See :func:`~torch.cuda.max_memory_allocated` for details.

      Args:
          device (torch.device or int, optional): selected device. Resets the
              statistic of the current device, given by
              :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
              (default).

      .. warning::
          This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
          /all/ peak memory stats. A ``FutureWarning`` is emitted on every call.

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      notice = (
          "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
          "which resets /all/ peak memory stats."
      )
      warnings.warn(notice, FutureWarning)
      return reset_peak_memory_stats(device=device)
  def reset_max_memory_cached(device: Union[Device, int] = None) -> None:
      r"""Resets the starting point in tracking maximum GPU memory managed by the
      caching allocator for a given device.

      See :func:`~torch.cuda.max_memory_cached` for details.

      Args:
          device (torch.device or int, optional): selected device. Resets the
              statistic of the current device, given by
              :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
              (default).

      .. warning::
          This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
          /all/ peak memory stats. A ``FutureWarning`` is emitted on every call.

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      notice = (
          "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
          "which resets /all/ peak memory stats."
      )
      warnings.warn(notice, FutureWarning)
      return reset_peak_memory_stats(device=device)
  def memory_allocated(device: Union[Device, int] = None) -> int:
      r"""Returns the current GPU memory occupied by tensors in bytes for a given
      device.

      The value is the ``"allocated_bytes.all.current"`` entry of
      :func:`~torch.cuda.memory_stats`, or ``0`` when that entry is absent.

      Args:
          device (torch.device or int, optional): selected device. Returns
              statistic for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      .. note::
          This is likely less than the amount shown in `nvidia-smi` since some
          unused memory can be held by the caching allocator and some context
          needs to be created on GPU. See :ref:`cuda-memory-management` for more
          details about GPU memory management.
      """
      stats = memory_stats(device=device)
      return stats.get("allocated_bytes.all.current", 0)
  def max_memory_allocated(device: Union[Device, int] = None) -> int:
      r"""Returns the maximum GPU memory occupied by tensors in bytes for a given
      device.

      By default, this is the peak allocated memory since the beginning of this
      program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
      the starting point in tracking this metric. For example, these two
      functions can measure the peak allocated memory usage of each iteration
      in a training loop.

      The value is the ``"allocated_bytes.all.peak"`` entry of
      :func:`~torch.cuda.memory_stats`, or ``0`` when that entry is absent.

      Args:
          device (torch.device or int, optional): selected device. Returns
              statistic for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      stats = memory_stats(device=device)
      return stats.get("allocated_bytes.all.peak", 0)
  def memory_reserved(device: Union[Device, int] = None) -> int:
      r"""Returns the current GPU memory managed by the caching allocator in bytes
      for a given device.

      The value is the ``"reserved_bytes.all.current"`` entry of
      :func:`~torch.cuda.memory_stats`, or ``0`` when that entry is absent.

      Args:
          device (torch.device or int, optional): selected device. Returns
              statistic for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      stats = memory_stats(device=device)
      return stats.get("reserved_bytes.all.current", 0)
  def max_memory_reserved(device: Union[Device, int] = None) -> int:
      r"""Returns the maximum GPU memory managed by the caching allocator in bytes
      for a given device.

      By default, this is the peak cached memory since the beginning of this
      program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
      the starting point in tracking this metric. For example, these two
      functions can measure the peak cached memory amount of each iteration in
      a training loop.

      The value is the ``"reserved_bytes.all.peak"`` entry of
      :func:`~torch.cuda.memory_stats`, or ``0`` when that entry is absent.

      Args:
          device (torch.device or int, optional): selected device. Returns
              statistic for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      stats = memory_stats(device=device)
      return stats.get("reserved_bytes.all.peak", 0)
  def memory_cached(device: Union[Device, int] = None) -> int:
      r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.

      Emits a ``FutureWarning`` and forwards to the renamed function.
      """
      notice = "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved"
      warnings.warn(notice, FutureWarning)
      return memory_reserved(device=device)
  def max_memory_cached(device: Union[Device, int] = None) -> int:
      r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.

      Emits a ``FutureWarning`` and forwards to the renamed function.
      """
      notice = "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved"
      warnings.warn(notice, FutureWarning)
      return max_memory_reserved(device=device)
  def memory_snapshot():
      r"""Returns a snapshot of the CUDA memory allocator state across all devices.

      Interpreting the output of this function requires familiarity with the
      memory allocator internals.

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      snapshot = torch._C._cuda_memorySnapshot()
      return snapshot
  def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
      r"""Returns a human-readable printout of the current memory allocator
      statistics for a given device.

      This can be useful to display periodically during training, or when
      handling out-of-memory exceptions.

      Args:
          device (torch.device or int, optional): selected device. Returns
              printout for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).
          abbreviated (bool, optional): whether to return an abbreviated summary
              that omits the per-pool rows (default: False).

      .. note::
          See :ref:`cuda-memory-management` for more details about GPU memory
          management.
      """
      device = _get_device_index(device, optional=True)
      stats = memory_stats(device=device)

      def _rescale(value, reference, units, step, cutoff):
          # Climb the unit ladder for as long as the *reference* value is still
          # at or above the cutoff. The reference (not the value itself) picks
          # the unit, so every row of one metric is shown in the same unit.
          rung = 0
          while rung < len(units) - 1 and reference >= cutoff:
              rung += 1
              value //= step
              reference /= step
          return value, units[rung]

      def _format_size(sz, pref_sz):
          scaled, unit = _rescale(sz, pref_sz, ["B ", "KB", "MB", "GB", "TB", "PB"], 1024, 768 * 1024)
          return "{:7d} {}".format(scaled, unit)

      def _format_count(cnt, pref_cnt):
          scaled, unit = _rescale(cnt, pref_cnt, [" ", "K", "M"], 1000, 750 * 1000)
          return "{:7d} {} ".format(scaled, unit)

      thick_rule = "=" * 75
      thin_rule = "-" * 75
      fields = ("current", "peak", "allocated", "freed")
      row_template = " {:<21} | {} | {} | {} | {} "

      # Header lines still contain ``{...}`` placeholders; they are filled in
      # by the single ``.format`` call at the very end.
      lines = [
          thick_rule,
          " {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ",
          thin_rule,
          " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} ",
          thick_rule,
          " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed ",
      ]

      pooled_metrics = [
          ("allocated_bytes", "Allocated memory", _format_size),
          ("active_bytes", "Active memory", _format_size),
          ("reserved_bytes", "GPU reserved memory", _format_size),
          ("inactive_split_bytes", "Non-releasable memory", _format_size),
          ("allocation", "Allocations", _format_count),
          ("active", "Active allocs", _format_count),
          ("segment", "GPU reserved segments", _format_count),
          ("inactive_split", "Non-releasable allocs", _format_count),
      ]
      for metric_key, metric_name, formatter in pooled_metrics:
          lines.append(thin_rule)
          submetrics = [("all", metric_name)]
          if not abbreviated:
              submetrics += [
                  ("large_pool", " from large pool"),
                  ("small_pool", " from small pool"),
              ]

          # Values of the first ("all") row; they choose the display unit for
          # every row of this metric.
          reference = None
          for submetric_key, submetric_name in submetrics:
              values = [stats[f"{metric_key}.{submetric_key}.{field}"] for field in fields]
              if reference is None:
                  reference = values
              cells = [formatter(value, ref) for value, ref in zip(values, reference)]
              lines.append(row_template.format(submetric_name, *cells))

      oversize_metrics = [
          ("oversize_allocations", "Oversize allocations", _format_count),
          ("oversize_segments", "Oversize GPU segments", _format_count),
      ]
      for metric_key, metric_name, formatter in oversize_metrics:
          lines.append(thin_rule)
          values = [stats[f"{metric_key}.{field}"] for field in fields]
          # These metrics have no pool breakdown, so each value is its own reference.
          cells = [formatter(value, value) for value in values]
          lines.append(row_template.format(metric_name, *cells))

      lines.append(thick_rule)

      fmt_dict = {"_": "", "device": device}
      # Dots are not usable in format field names, so stat keys are exposed
      # with "-" in their place.
      fmt_dict.update((key.replace(".", "-"), value) for key, value in stats.items())
      return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"
  def list_gpu_processes(device: Union[Device, int] = None) -> str:
      r"""Returns a human-readable printout of the running processes
      and their GPU memory use for a given device.

      This can be useful to display periodically during training, or when
      handling out-of-memory exceptions.

      The information is obtained through the optional ``pynvml`` package. If
      ``pynvml`` is not installed, or the driver cannot be loaded, an
      explanatory message is returned instead of a process listing.

      Args:
          device (torch.device or int, optional): selected device. Returns
              printout for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      Returns:
          str: one ``GPU:<index>`` header line followed by one line per compute
          process, or a single line stating that no processes are running.
      """
      try:
          import pynvml  # type: ignore[import]
      except ModuleNotFoundError:
          return("pynvml module not found, please install pynvml")
      from pynvml import NVMLError_DriverNotLoaded
      try:
          pynvml.nvmlInit()
      except NVMLError_DriverNotLoaded:
          return ("cuda driver can't be loaded, is cuda enabled?")
      device = _get_device_index(device, optional=True)
      handle = pynvml.nvmlDeviceGetHandleByIndex(device)
      procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
      lines = [f"GPU:{device}"]
      if not procs:
          lines.append("no processes are running")
      for p in procs:
          # pynvml reports ``usedGpuMemory`` as None when NVML cannot provide
          # the value; dividing None would raise TypeError, so such processes
          # are listed without a figure.
          if p.usedGpuMemory is None:
              lines.append(f"process {p.pid:>10d} uses an unknown amount of GPU memory")
              continue
          mem = p.usedGpuMemory / (1024 * 1024)
          lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory")
      return "\n".join(lines)
  def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]:
      r"""Returns the global free and total GPU memory for a given
      device using cudaMemGetInfo.

      Args:
          device (torch.device or int, optional): selected device. Returns
              statistic for the current device, given by :func:`~torch.cuda.current_device`,
              if :attr:`device` is ``None`` (default).

      Returns:
          Tuple[int, int]: the ``(free, total)`` pair reported by
          ``cudaMemGetInfo`` for the device.

      .. note::
          See :ref:`cuda-memory-management` for more
          details about GPU memory management.
      """
      if device is None:
          device = torch.cuda.current_device()
      device = _get_device_index(device)
      return torch.cuda.cudart().cudaMemGetInfo(device)
  469. device = torch.cuda.current_device()
  470. device = _get_device_index(device)
  471. return torch.cuda.cudart().cudaMemGetInfo(device)