| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592 |
- import collections
- import contextlib
- import warnings
- from typing import Any, Dict, Union
- import torch
- from . import is_initialized, _get_device_index, _lazy_init
- from torch.types import Device
- __all__ = ["caching_allocator_alloc", "caching_allocator_delete", "set_per_process_memory_fraction",
- "empty_cache", "memory_stats", "memory_stats_as_nested_dict", "reset_accumulated_memory_stats",
- "reset_peak_memory_stats", "reset_max_memory_allocated", "reset_max_memory_cached",
- "memory_allocated", "max_memory_allocated", "memory_reserved", "max_memory_reserved",
- "memory_cached", "max_memory_cached", "memory_snapshot", "memory_summary", "list_gpu_processes",
- "mem_get_info"]
- def _host_allocator():
- _lazy_init()
- return torch._C._cuda_cudaHostAllocator()
- @contextlib.contextmanager
- def _free_mutex():
- torch._C._cuda_lock_mutex()
- try:
- yield
- finally:
- torch._C._cuda_unlock_mutex()
- def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
- r"""Performs a memory allocation using the CUDA memory allocator.
- Memory is allocated for a given device and a stream, this
- function is intended to be used for interoperability with other
- frameworks. Allocated memory is released through
- :func:`~torch.cuda.caching_allocator_delete`.
- Args:
- size (int): number of bytes to be allocated.
- device (torch.device or int, optional): selected device. If it is
- ``None`` the default CUDA device is used.
- stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then
- the default stream for the selected device is used.
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- if device is None:
- device = torch.cuda.current_device()
- device = _get_device_index(device)
- if stream is None:
- stream = torch.cuda.current_stream(device)
- if isinstance(stream, torch.cuda.streams.Stream):
- stream = stream.cuda_stream
- if not isinstance(stream, int):
- raise TypeError('Invalid type for stream argument, must be '
- '`torch.cuda.Stream` or `int` representing a pointer '
- 'to a exisiting stream')
- with torch.cuda.device(device):
- return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
- def caching_allocator_delete(mem_ptr):
- r"""Deletes memory allocated using the CUDA memory allocator.
- Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`.
- is freed here. The associated device and stream are tracked inside
- the allocator.
- Args:
- mem_ptr (int): memory address to be freed by the allocator.
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
- def set_per_process_memory_fraction(fraction, device: Union[Device, int] = None) -> None:
- r"""Set memory fraction for a process.
- The fraction is used to limit an caching allocator to allocated memory on a CUDA device.
- The allowed value equals the total visible memory multiplied fraction.
- If trying to allocate more than the allowed value in a process, will raise an out of
- memory error in allocator.
- Args:
- fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
- device (torch.device or int, optional): selected device. If it is
- ``None`` the default CUDA device is used.
- .. note::
- In general, the total available free memory is less than the total capacity.
- """
- _lazy_init()
- if device is None:
- device = torch.cuda.current_device()
- device = _get_device_index(device)
- if not isinstance(fraction, float):
- raise TypeError('Invalid type for fraction argument, must be `float`')
- if fraction < 0 or fraction > 1:
- raise ValueError('Invalid fraction value: {}. '
- 'Allowed range: 0~1'.format(fraction))
- torch._C._cuda_setMemoryFraction(fraction, device)
- def empty_cache() -> None:
- r"""Releases all unoccupied cached memory currently held by the caching
- allocator so that those can be used in other GPU application and visible in
- `nvidia-smi`.
- .. note::
- :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
- memory available for PyTorch. However, it may help reduce fragmentation
- of GPU memory in certain cases. See :ref:`cuda-memory-management` for
- more details about GPU memory management.
- """
- if is_initialized():
- torch._C._cuda_emptyCache()
- def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
- r"""Returns a dictionary of CUDA memory allocator statistics for a
- given device.
- The return value of this function is a dictionary of statistics, each of
- which is a non-negative integer.
- Core statistics:
- - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- number of allocation requests received by the memory allocator.
- - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- amount of allocated memory.
- - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- number of reserved segments from ``cudaMalloc()``.
- - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- amount of reserved memory.
- - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- number of active memory blocks.
- - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- amount of active memory.
- - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- number of inactive, non-releasable memory blocks.
- - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
- amount of inactive, non-releasable memory.
- For these core statistics, values are broken down as follows.
- Pool type:
- - ``all``: combined statistics across all memory pools.
- - ``large_pool``: statistics for the large allocation pool
- (as of October 2019, for size >= 1MB allocations).
- - ``small_pool``: statistics for the small allocation pool
- (as of October 2019, for size < 1MB allocations).
- Metric type:
- - ``current``: current value of this metric.
- - ``peak``: maximum value of this metric.
- - ``allocated``: historical total increase in this metric.
- - ``freed``: historical total decrease in this metric.
- In addition to the core statistics, we also provide some simple event
- counters:
- - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that
- result in a cache flush and retry.
- - ``"num_ooms"``: number of out-of-memory errors thrown.
- The caching allocator can be configured via ENV to not split blocks larger than a
- defined size (see Memory Management section of the Cuda Semantics documentation).
- This helps avoid memory framentation but may have a performance
- penalty. Additional outputs to assist with tuning and evaluating impact:
- - ``"max_split_size"``: blocks above this size will not be split.
- - ``"oversize_allocations.{current,peak,allocated,freed}"``:
- number of over-size allocation requests received by the memory allocator.
- - ``"oversize_segments.{current,peak,allocated,freed}"``:
- number of over-size reserved segments from ``cudaMalloc()``.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistics for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- result = []
- def _recurse_add_to_result(prefix, obj):
- if isinstance(obj, dict):
- if len(prefix) > 0:
- prefix += "."
- for k, v in obj.items():
- _recurse_add_to_result(prefix + k, v)
- else:
- result.append((prefix, obj))
- stats = memory_stats_as_nested_dict(device=device)
- _recurse_add_to_result("", stats)
- result.sort()
- return collections.OrderedDict(result)
- def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]:
- r"""Returns the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
- if not is_initialized():
- return {}
- device = _get_device_index(device, optional=True)
- return torch._C._cuda_memoryStats(device)
- def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
- r"""Resets the "accumulated" (historical) stats tracked by the CUDA memory allocator.
- See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
- the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
- `"num_alloc_retries"` and `"num_ooms"`.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- device = _get_device_index(device, optional=True)
- return torch._C._cuda_resetAccumulatedMemoryStats(device)
- def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
- r"""Resets the "peak" stats tracked by the CUDA memory allocator.
- See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
- `"peak"` key in each individual stat dict.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- device = _get_device_index(device, optional=True)
- return torch._C._cuda_resetPeakMemoryStats(device)
- def reset_max_memory_allocated(device: Union[Device, int] = None) -> None:
- r"""Resets the starting point in tracking maximum GPU memory occupied by
- tensors for a given device.
- See :func:`~torch.cuda.max_memory_allocated` for details.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. warning::
- This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
- /all/ peak memory stats.
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- warnings.warn(
- "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
- "which resets /all/ peak memory stats.",
- FutureWarning)
- return reset_peak_memory_stats(device=device)
- def reset_max_memory_cached(device: Union[Device, int] = None) -> None:
- r"""Resets the starting point in tracking maximum GPU memory managed by the
- caching allocator for a given device.
- See :func:`~torch.cuda.max_memory_cached` for details.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. warning::
- This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
- /all/ peak memory stats.
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- warnings.warn(
- "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
- "which resets /all/ peak memory stats.",
- FutureWarning)
- return reset_peak_memory_stats(device=device)
- def memory_allocated(device: Union[Device, int] = None) -> int:
- r"""Returns the current GPU memory occupied by tensors in bytes for a given
- device.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- This is likely less than the amount shown in `nvidia-smi` since some
- unused memory can be held by the caching allocator and some context
- needs to be created on GPU. See :ref:`cuda-memory-management` for more
- details about GPU memory management.
- """
- return memory_stats(device=device).get("allocated_bytes.all.current", 0)
- def max_memory_allocated(device: Union[Device, int] = None) -> int:
- r"""Returns the maximum GPU memory occupied by tensors in bytes for a given
- device.
- By default, this returns the peak allocated memory since the beginning of
- this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
- reset the starting point in tracking this metric. For example, these two
- functions can measure the peak allocated memory usage of each iteration in a
- training loop.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
- def memory_reserved(device: Union[Device, int] = None) -> int:
- r"""Returns the current GPU memory managed by the caching allocator in bytes
- for a given device.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- return memory_stats(device=device).get("reserved_bytes.all.current", 0)
- def max_memory_reserved(device: Union[Device, int] = None) -> int:
- r"""Returns the maximum GPU memory managed by the caching allocator in bytes
- for a given device.
- By default, this returns the peak cached memory since the beginning of this
- program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
- the starting point in tracking this metric. For example, these two functions
- can measure the peak cached memory amount of each iteration in a training
- loop.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
- def memory_cached(device: Union[Device, int] = None) -> int:
- r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
- warnings.warn(
- "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved",
- FutureWarning)
- return memory_reserved(device=device)
- def max_memory_cached(device: Union[Device, int] = None) -> int:
- r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
- warnings.warn(
- "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved",
- FutureWarning)
- return max_memory_reserved(device=device)
- def memory_snapshot():
- r"""Returns a snapshot of the CUDA memory allocator state across all devices.
- Interpreting the output of this function requires familiarity with the
- memory allocator internals.
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- return torch._C._cuda_memorySnapshot()
- def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
- r"""Returns a human-readable printout of the current memory allocator
- statistics for a given device.
- This can be useful to display periodically during training, or when
- handling out-of-memory exceptions.
- Args:
- device (torch.device or int, optional): selected device. Returns
- printout for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- abbreviated (bool, optional): whether to return an abbreviated summary
- (default: False).
- .. note::
- See :ref:`cuda-memory-management` for more details about GPU memory
- management.
- """
- device = _get_device_index(device, optional=True)
- stats = memory_stats(device=device)
- def _format_size(sz, pref_sz):
- prefixes = ["B ", "KB", "MB", "GB", "TB", "PB"]
- prefix = prefixes[0]
- for new_prefix in prefixes[1:]:
- if pref_sz < 768 * 1024:
- break
- prefix = new_prefix
- sz //= 1024
- pref_sz /= 1024
- return "{:7d} {}".format(sz, prefix)
- def _format_count(cnt, pref_cnt):
- prefixes = [" ", "K", "M"]
- prefix = prefixes[0]
- for new_prefix in prefixes[1:]:
- if pref_cnt < 750 * 1000:
- break
- prefix = new_prefix
- cnt //= 1000
- pref_cnt /= 1000
- return "{:7d} {} ".format(cnt, prefix)
- metrics_to_display = [
- ("allocated_bytes", "Allocated memory", _format_size),
- ("active_bytes", "Active memory", _format_size),
- ("reserved_bytes", "GPU reserved memory", _format_size),
- ("inactive_split_bytes", "Non-releasable memory", _format_size),
- ("allocation", "Allocations", _format_count),
- ("active", "Active allocs", _format_count),
- ("segment", "GPU reserved segments", _format_count),
- ("inactive_split", "Non-releasable allocs", _format_count),
- ]
- lines = []
- lines.append("=" * 75)
- lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ")
- lines.append("-" * 75)
- lines.append(" {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} ")
- lines.append("=" * 75)
- lines.append(" Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed ")
- for metric_key, metric_name, formatter in metrics_to_display:
- lines.append("-" * 75)
- submetrics = [("all", metric_name)]
- if not abbreviated:
- submetrics.append(("large_pool", " from large pool"))
- submetrics.append(("small_pool", " from small pool"))
- current_prefval, peak_prefval, allocated_prefval, freed_prefval = None, None, None, None
- for submetric_key, submetric_name in submetrics:
- prefix = metric_key + "." + submetric_key + "."
- current = stats[prefix + "current"]
- peak = stats[prefix + "peak"]
- allocated = stats[prefix + "allocated"]
- freed = stats[prefix + "freed"]
- if current_prefval is None:
- current_prefval = current
- peak_prefval = peak
- allocated_prefval = allocated
- freed_prefval = freed
- lines.append(" {:<21} | {} | {} | {} | {} ".format(
- submetric_name,
- formatter(current, current_prefval),
- formatter(peak, peak_prefval),
- formatter(allocated, allocated_prefval),
- formatter(freed, freed_prefval)),
- )
- metrics_to_display = [
- ("oversize_allocations", "Oversize allocations", _format_count),
- ("oversize_segments", "Oversize GPU segments", _format_count),
- ]
- for metric_key, metric_name, formatter in metrics_to_display:
- lines.append("-" * 75)
- prefix = metric_key + "."
- current = stats[prefix + "current"]
- peak = stats[prefix + "peak"]
- allocated = stats[prefix + "allocated"]
- freed = stats[prefix + "freed"]
- lines.append(" {:<21} | {} | {} | {} | {} ".format(
- metric_name,
- formatter(current, current),
- formatter(peak, peak),
- formatter(allocated, allocated),
- formatter(freed, freed)),
- )
- lines.append("=" * 75)
- fmt_dict = {"_": "", "device": device}
- for k, v in stats.items():
- fmt_dict[k.replace(".", "-")] = v
- return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"
- def list_gpu_processes(device: Union[Device, int] = None) -> str:
- r"""Returns a human-readable printout of the running processes
- and their GPU memory use for a given device.
- This can be useful to display periodically during training, or when
- handling out-of-memory exceptions.
- Args:
- device (torch.device or int, optional): selected device. Returns
- printout for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- """
- try:
- import pynvml # type: ignore[import]
- except ModuleNotFoundError:
- return("pynvml module not found, please install pynvml")
- from pynvml import NVMLError_DriverNotLoaded
- try:
- pynvml.nvmlInit()
- except NVMLError_DriverNotLoaded:
- return ("cuda driver can't be loaded, is cuda enabled?")
- device = _get_device_index(device, optional=True)
- handle = pynvml.nvmlDeviceGetHandleByIndex(device)
- procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
- lines = []
- lines.append(f"GPU:{device}")
- if len(procs) == 0:
- lines.append("no processes are running")
- for p in procs:
- mem = p.usedGpuMemory / (1024 * 1024)
- lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory")
- return "\n".join(lines)
- def mem_get_info(device: Union[Device, int] = None) -> int:
- r"""Returns the global free and total GPU memory occupied for a given
- device using cudaMemGetInfo.
- Args:
- device (torch.device or int, optional): selected device. Returns
- statistic for the current device, given by :func:`~torch.cuda.current_device`,
- if :attr:`device` is ``None`` (default).
- .. note::
- See :ref:`cuda-memory-management` for more
- details about GPU memory management.
- """
- if device is None:
- device = torch.cuda.current_device()
- device = _get_device_index(device)
- return torch.cuda.cudart().cudaMemGetInfo(device)
|