nvml.h 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. /*
  2. * Copyright © 2012-2023 Inria. All rights reserved.
  3. * See COPYING in top-level directory.
  4. */
  5. /** \file
  6. * \brief Macros to help interaction between hwloc and the NVIDIA Management Library.
  7. *
  8. * Applications that use both hwloc and the NVIDIA Management Library may want to
  9. * include this file so as to get topology information for NVML devices.
  10. */
  11. #ifndef HWLOC_NVML_H
  12. #define HWLOC_NVML_H
  13. #include "hwloc.h"
  14. #include "hwloc/autogen/config.h"
  15. #include "hwloc/helper.h"
  16. #ifdef HWLOC_LINUX_SYS
  17. #include "hwloc/linux.h"
  18. #endif
  19. #include <nvml.h>
  20. #ifdef __cplusplus
  21. extern "C" {
  22. #endif
  23. /** \defgroup hwlocality_nvml Interoperability with the NVIDIA Management Library
  24. *
  25. * This interface offers ways to retrieve topology information about
  26. * devices managed by the NVIDIA Management Library (NVML).
  27. *
  28. * @{
  29. */
  30. /** \brief Get the CPU set of processors that are physically
  31. * close to NVML device \p device.
  32. *
  33. * Store in \p set the CPU-set describing the locality of the NVML device \p device.
  34. *
  35. * Topology \p topology and device \p device must match the local machine.
  36. * I/O devices detection and the NVML component are not needed in the topology.
  37. *
  38. * The function only returns the locality of the device.
  39. * If more information about the device is needed, OS objects should
  40. * be used instead, see hwloc_nvml_get_device_osdev()
  41. * and hwloc_nvml_get_device_osdev_by_index().
  42. *
  43. * This function is currently only implemented in a meaningful way for
  44. * Linux; other systems will simply get a full cpuset.
  45. *
  46. * \return 0 on success.
  47. * \return -1 on error, for instance if device information could not be found.
  48. */
  49. static __hwloc_inline int
  50. hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
  51. nvmlDevice_t device, hwloc_cpuset_t set)
  52. {
  53. #ifdef HWLOC_LINUX_SYS
  54. /* If we're on Linux, use the sysfs mechanism to get the local cpus */
  55. #define HWLOC_NVML_DEVICE_SYSFS_PATH_MAX 128
  56. char path[HWLOC_NVML_DEVICE_SYSFS_PATH_MAX];
  57. nvmlReturn_t nvres;
  58. nvmlPciInfo_t pci;
  59. if (!hwloc_topology_is_thissystem(topology)) {
  60. errno = EINVAL;
  61. return -1;
  62. }
  63. nvres = nvmlDeviceGetPciInfo(device, &pci);
  64. if (NVML_SUCCESS != nvres) {
  65. errno = EINVAL;
  66. return -1;
  67. }
  68. sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", pci.domain, pci.bus, pci.device);
  69. if (hwloc_linux_read_path_as_cpumask(path, set) < 0
  70. || hwloc_bitmap_iszero(set))
  71. hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
  72. #else
  73. /* Non-Linux systems simply get a full cpuset */
  74. hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
  75. #endif
  76. return 0;
  77. }
  78. /** \brief Get the hwloc OS device object corresponding to the
  79. * NVML device whose index is \p idx.
  80. *
  81. * \return The hwloc OS device object describing the NVML device whose index is \p idx.
  82. * \return \c NULL if none could be found.
  83. *
  84. * The topology \p topology does not necessarily have to match the current
  85. * machine. For instance the topology may be an XML import of a remote host.
  86. * I/O devices detection and the NVML component must be enabled in the topology.
  87. *
  88. * \note The corresponding PCI device object can be obtained by looking
  89. * at the OS device parent object (unless PCI devices are filtered out).
  90. */
  91. static __hwloc_inline hwloc_obj_t
  92. hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
  93. {
  94. hwloc_obj_t osdev = NULL;
  95. while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
  96. if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
  97. && osdev->name
  98. && !strncmp("nvml", osdev->name, 4)
  99. && atoi(osdev->name + 4) == (int) idx)
  100. return osdev;
  101. }
  102. return NULL;
  103. }
  104. /** \brief Get the hwloc OS device object corresponding to NVML device \p device.
  105. *
  106. * \return The hwloc OS device object that describes the given NVML device \p device.
  107. * \return \c NULL if none could be found.
  108. *
  109. * Topology \p topology and device \p device must match the local machine.
  110. * I/O devices detection and the NVML component must be enabled in the topology.
  111. * If not, the locality of the object may still be found using
  112. * hwloc_nvml_get_device_cpuset().
  113. *
  114. * \note The corresponding hwloc PCI device may be found by looking
  115. * at the result parent pointer (unless PCI devices are filtered out).
  116. */
  117. static __hwloc_inline hwloc_obj_t
  118. hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
  119. {
  120. hwloc_obj_t osdev;
  121. nvmlReturn_t nvres;
  122. nvmlPciInfo_t pci;
  123. char uuid[64];
  124. if (!hwloc_topology_is_thissystem(topology)) {
  125. errno = EINVAL;
  126. return NULL;
  127. }
  128. nvres = nvmlDeviceGetPciInfo(device, &pci);
  129. if (NVML_SUCCESS != nvres)
  130. return NULL;
  131. nvres = nvmlDeviceGetUUID(device, uuid, sizeof(uuid));
  132. if (NVML_SUCCESS != nvres)
  133. uuid[0] = '\0';
  134. osdev = NULL;
  135. while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
  136. hwloc_obj_t pcidev = osdev->parent;
  137. const char *info;
  138. if (strncmp(osdev->name, "nvml", 4))
  139. continue;
  140. if (pcidev
  141. && pcidev->type == HWLOC_OBJ_PCI_DEVICE
  142. && pcidev->attr->pcidev.domain == pci.domain
  143. && pcidev->attr->pcidev.bus == pci.bus
  144. && pcidev->attr->pcidev.dev == pci.device
  145. && pcidev->attr->pcidev.func == 0)
  146. return osdev;
  147. info = hwloc_obj_get_info_by_name(osdev, "NVIDIAUUID");
  148. if (info && !strcmp(info, uuid))
  149. return osdev;
  150. }
  151. return NULL;
  152. }
  153. /** @} */
  154. #ifdef __cplusplus
  155. } /* extern "C" */
  156. #endif
  157. #endif /* HWLOC_NVML_H */