#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
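// This translation unit is compiled only for CUDA builds (not ROCm) that
// declare support for the C10 driver API wrappers.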
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>
#include <dlfcn.h>
namespace c10::cuda {
namespace {
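// Resolves a CUDA driver symbol by name and minimum version; returns nullptr
// if it cannot be found (defined below).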
void* get_symbol(const char* name, int version);
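// Builds the DriverAPI dispatch table: required libcuda entry points are
// resolved and asserted to exist, optional ones may remain nullptr, and NVML
// entry points are filled in only if libnvidia-ml could be loaded.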
DriverAPI create_driver_api() {
  void* handle_1 = DriverAPI::get_nvml_handle();
  DriverAPI r{};

#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED(name, version)            \
  r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
  TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name);
  C10_LIBCUDA_DRIVER_API_REQUIRED(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED)
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED

  // Users running drivers between 12.0 and 12.3 will not have these symbols;
  // they resolve to nullptr, but we guard their usage at runtime to ensure
  // safe fallback behavior.
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL(name, version) \
  r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version));
  C10_LIBCUDA_DRIVER_API_OPTIONAL(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL)
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL
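  // NVML is not available everywhere (e.g. it is unsupported on Jetson
  // devices, see https://github.com/pytorch/pytorch/pull/112121). If
  // get_nvml_handle() returned nullptr, the NVML entries stay null and
  // callers fall back to the older libcuda-based paths for expandable
  // segments and OOM reporting.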
  if (handle_1) {
#define LOOKUP_NVML_ENTRY(name)                           \
  r.name##_ = ((decltype(&name))dlsym(handle_1, #name));  \
  TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
    C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY)
#undef LOOKUP_NVML_ENTRY
  }
  return r;
}
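// Resolves a driver entry point through the CUDA runtime: prefer the
// version-aware query on CUDA 12.5+, fall back to the unversioned query, and
// return nullptr when the symbol cannot be resolved so that callers can guard
// optional entry points at runtime.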
void* get_symbol(const char* name, int version) {
  void* out = nullptr;
  cudaDriverEntryPointQueryResult qres{};

  // CUDA 12.5+ supports version-based lookup
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
  if (auto st = cudaGetDriverEntryPointByVersion(
          name, &out, version, cudaEnableDefault, &qres);
      st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
    return out;
  }
#endif
  // Fall back to the old API to try getting the symbol again.
  if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
      st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
    return out;
  }
  // If the symbol cannot be resolved, report and return nullptr;
  // the caller is responsible for checking the pointer.
  LOG(INFO) << "Failed to resolve symbol " << name;
  return nullptr;
}
} // namespace
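// Lazily dlopen() libnvidia-ml; the handle is nullptr on systems without NVML
// (e.g. Jetson), which disables the NVML-backed lookups above.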
void* DriverAPI::get_nvml_handle() {
  static void* nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
  return nvml_handle;
}
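// Function-local static: the dispatch table is built exactly once, on first
// use, with thread-safe initialization.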
C10_EXPORT DriverAPI* DriverAPI::get() {
  static DriverAPI singleton = create_driver_api();
  return &singleton;
}
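// Illustrative usage at a hypothetical call site (not part of this file; the
// member name below is a placeholder, not an actual DriverAPI field):
//
//   auto* driver = c10::cuda::DriverAPI::get();
//   // Required entry points were asserted non-null during construction;
//   // optional and NVML entry points must be null-checked before use:
//   if (driver->someOptionalEntry_ != nullptr) {
//     // ... use the newer driver API ...
//   } else {
//     // ... fall back to the older code path ...
//   }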
} // namespace c10::cuda
#endif