#include <chrono>
#include <filesystem>
#include <fstream>

[NCCL PG] Add a separate monitoring thread to ensure we collect debug info and check the watchdog heartbeat (#112518)

This PR has the following goals:
1. Detect an unhealthy NCCL watchdog thread by implementing a heartbeat. The NCCL watchdog can sometimes hang for several reasons, such as NCCL/CUDA API bugs or unexpected blocking behavior. This is the last resort to ensure that we don't silently keep the training job running for hours.
2. Sometimes the process gets stuck in the destruction of the NCCL PG; this PR ensures that we eventually abort it after some time (by default 2 minutes).
3. Once the heartbeat can no longer be heard, we dump debug information to disk (for now we just use the flight recorder implemented in https://github.com/pytorch/pytorch/pull/110960/files). How and where to dump the debug info will be addressed in a follow-up PR.
4. Finally, we initiate std::abort via `LOG(FATAL)` to kill the process.

To clarify further what this PR is trying to solve, here are the four cases an NCCL PG can end up in:
- case 1: the NCCL watchdog gets stuck (maybe in some blocking API) and the heartbeat monitor kills the process during its regular monitoring loop.
- case 2: the NCCL watchdog times out and the desync report or destroy kicks in (call it shutdown), but this shutdown takes so long that the heartbeat monitor decides it has to kill the process anyway.
- case 3: the NCCL watchdog aborts the process (heartbeat monitor not involved).
- case 4: the program exits cleanly (heartbeat monitor not involved).

This PR addresses cases one and two, and we also want to ensure that adding one more monitor thread does not interfere with what we currently do in cases three and four. That is why we added two flags, `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_`.

For cases three and four, either `monitorWakeUpCV_` is woken up in the destructor or `terminateHeartbeatMonitorThread_` is set to true, so the monitor thread exits as soon as possible.

For case one, both `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_` are still false when the monitor thread sees that there is no heartbeat, so it kills the process directly. For case two, either `terminateHeartbeatMonitorThread_` or `collectiveDebugInfoMode_` is true, and the monitor thread waits extra time before killing the process.

Differential Revision: [D51146305](https://our.internmc.facebook.com/intern/diff/D51146305)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112518
Approved by: https://github.com/kwen2501, https://github.com/wconstab
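
The mechanism described above boils down to two threads and a counter: the watchdog thread bumps a heartbeat counter as it makes progress, and the monitor thread periodically checks that the counter has advanced, with a terminate flag plus a condition variable so that a normal shutdown (cases 3 and 4) can wake the monitor early. Below is a minimal, self-contained C++ sketch of that pattern, not the actual ProcessGroupNCCL code: the names (`HeartbeatedWatchdog`, `watchdogLoop`, `monitorLoop`) are illustrative placeholders, and the real implementation layers the `terminateHeartbeatMonitorThread_` / `collectiveDebugInfoMode_` logic and the flight-recorder dump on top of this skeleton.

```cpp
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <thread>

// Illustrative sketch only; names do not match the real ProcessGroupNCCL.
class HeartbeatedWatchdog {
 public:
  HeartbeatedWatchdog() {
    watchdog_ = std::thread([this] { watchdogLoop(); });
    monitor_ = std::thread([this] { monitorLoop(); });
  }

  ~HeartbeatedWatchdog() {
    {
      // Signal a clean shutdown and wake the monitor thread immediately.
      std::lock_guard<std::mutex> lock(monitorMutex_);
      terminate_ = true;
    }
    monitorWakeUpCV_.notify_all();
    watchdog_.join();
    monitor_.join();
  }

 private:
  void watchdogLoop() {
    while (!terminate_) {
      // Real watchdog work (polling outstanding collectives) would go here.
      // Every completed iteration counts as one heartbeat.
      heartbeat_.fetch_add(1, std::memory_order_relaxed);
      std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
  }

  void monitorLoop() {
    uint64_t lastBeat = 0;
    std::unique_lock<std::mutex> lock(monitorMutex_);
    while (true) {
      // Sleep with a timeout so the destructor can wake us up early.
      if (monitorWakeUpCV_.wait_for(lock, std::chrono::seconds(2), [this] {
            return terminate_.load();
          })) {
        return; // Clean shutdown; nothing to kill.
      }
      const uint64_t beat = heartbeat_.load(std::memory_order_relaxed);
      if (beat == lastBeat) {
        // The watchdog made no progress since the last check. A real
        // implementation would dump debug info (flight recorder) first and
        // possibly wait extra time; here we just log and abort.
        std::cerr << "watchdog heartbeat missed; aborting\n";
        std::abort();
      }
      lastBeat = beat;
    }
  }

  std::atomic<bool> terminate_{false};
  std::atomic<uint64_t> heartbeat_{0};
  std::mutex monitorMutex_;
  std::condition_variable monitorWakeUpCV_;
  std::thread watchdog_;
  std::thread monitor_;
};

int main() {
  HeartbeatedWatchdog wd;
  // The watchdog beats for a second, then both threads shut down cleanly.
  std::this_thread::sleep_for(std::chrono::seconds(1));
  return 0;
}
```

The test helpers below exercise the real implementation instead, by subclassing `ProcessGroupNCCL`, its `WorkNCCL` objects, and its `HeartbeatMonitor`.
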
#include <future>
#include <mutex>
#include <thread>

#include <c10/util/irange.h>
#include <torch/csrc/cuda/nccl.h>
#include <torch/csrc/distributed/c10d/FileStore.hpp>
#include <torch/csrc/distributed/c10d/FlightRecorder.hpp>
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#include <utility>
#include "CUDATest.hpp"
#include "TestUtils.hpp"

#include <gtest/gtest.h>

using namespace c10d::test;

constexpr int kNcclErrorHandlingVersion = 2400;

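// Test-only Work object: when `simulate_error` is set, checkForNCCLErrors()
// returns a fabricated error instead of deferring to the real check.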
class WorkNCCLSimulateErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
 public:
  WorkNCCLSimulateErrors(
      at::Device& device,
      bool simulate_error,
      int rank,
      c10d::OpType opType,
      uint64_t seq,
      bool isP2P)
      : WorkNCCL("0", "default_pg", device, rank, opType, seq, isP2P),
        simulateError_(simulate_error) {}

  std::exception_ptr checkForNCCLErrors() override {
    if (simulateError_) {
      return std::make_exception_ptr(std::runtime_error("Error"));
    }
    return c10d::ProcessGroupNCCL::WorkNCCL::checkForNCCLErrors();
  }

 private:
  bool simulateError_;
};

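// Test-only ProcessGroupNCCL that hands out WorkNCCLSimulateErrors work
// objects, so tests can toggle a simulated NCCL error via
// simulateError() / resetError().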
class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
 public:
  ProcessGroupNCCLSimulateErrors(
      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> opts)
      : ProcessGroupNCCL(store, rank, size, std::move(opts)) {}

  std::exception_ptr checkForNCCLErrors(
      std::shared_ptr<c10d::NCCLComm>& ncclComm) override {
    if (simulateError_) {
      return std::make_exception_ptr(std::runtime_error("Error"));
    }
    return c10d::ProcessGroupNCCL::checkForNCCLErrors(ncclComm);
  }

  std::chrono::duration<int64_t, std::milli> getWatchdogSleepInterval() {
    return std::chrono::milliseconds(
        ProcessGroupNCCLSimulateErrors::kWatchdogThreadSleepMillis);
  }

  c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> initWork(
      at::Device& device,
      int rank,
      c10d::OpType opType,
      bool isP2P,
      const char* profilingTitle,
      const std::vector<at::Tensor>& inputs = {},
      const std::vector<at::Tensor>& outputs = {},
      bool record = false) override {
    return c10::make_intrusive<WorkNCCLSimulateErrors>(
        device,
        simulateError_,
        rank,
        opType,
        isP2P ? seqP2P_ : seqCollective_,
        isP2P);
  }

  size_t getNCCLCommCacheSize() {
    return devNCCLCommMap_.size();
  }

  void simulateError() {
    simulateError_ = true;
  }

  void resetError() {
    simulateError_ = false;
  }

 private:
  bool simulateError_{false};
};

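// Test-only Work object: when `set_timedout_error` is set, isCompleted()
// always reports false, so the work never reports completion.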
class WorkNCCLTimedoutErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
 public:
  WorkNCCLTimedoutErrors(
      at::Device& device,
      bool set_timedout_error,
      int rank,
      c10d::OpType opType,
      uint64_t seq,
      bool isP2P)
      : WorkNCCL("0", "default_pg", device, rank, opType, seq, isP2P),
        setTimedoutError_(set_timedout_error) {}

 private:
  bool isCompleted() override {
    if (setTimedoutError_) {
      return false;
    }
    return c10d::ProcessGroupNCCL::WorkNCCL::isCompleted();
  }

 private:
  bool setTimedoutError_;
};

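// Test-only ProcessGroupNCCL that hands out WorkNCCLTimedoutErrors work
// objects, letting tests make collectives appear timed out.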
class ProcessGroupNCCLTimedOutErrors : public ProcessGroupNCCLSimulateErrors {
 public:
  ProcessGroupNCCLTimedOutErrors(
      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> opts)
      : ProcessGroupNCCLSimulateErrors(store, rank, size, std::move(opts)) {}

  c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> initWork(
      at::Device& device,
      int rank,
      c10d::OpType opType,
      bool isP2P,
      const char* profilingTitle,
      const std::vector<at::Tensor>& inputs = {},
      const std::vector<at::Tensor>& outputs = {},
      bool record = false) override {
    return c10::make_intrusive<WorkNCCLTimedoutErrors>(
        device,
        setTimedoutError_,
        rank,
        opType,
        isP2P ? seqP2P_ : seqCollective_,
        isP2P);
  }

  void setTimedoutError() {
    setTimedoutError_ = true;
  }

  void resetTimedoutError() {
    setTimedoutError_ = false;
  }

  // The constructor of ProcessGroupNCCL does not allow the watchdog thread to
  // run any error handling or desync report while the main thread is doing a
  // blocking wait; even if users enable error handling and the desyncDebug
  // flag, both get reset there. For ease of unit testing we do want the main
  // thread to block-wait, so we use this hack to manually set the desync
  // debug flag after PG creation.
  void forceSetDesyncDebugFlag() {
    watchdog_->setDesyncDebug(true);
  }

 private:
  bool setTimedoutError_{false};
};

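// Test-only ProcessGroupNCCL that installs TestHeartbeatMonitor, which
// records errors from the heartbeat monitor thread in a flag the test can
// read via getErrorCaughtFlag().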
class ProcessGroupNCCLNoHeartbeatCaught
    : public ProcessGroupNCCLTimedOutErrors {
 public:
  ProcessGroupNCCLNoHeartbeatCaught(
      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> opts)
      : ProcessGroupNCCLTimedOutErrors(store, rank, size, std::move(opts)) {
    // Override the heartbeat monitor so that the exception is captured inside
    // the monitor thread (we cannot try-catch it from the main thread) and a
    // flag is set for the main thread to check.
    heartbeatMonitor_ = std::make_unique<TestHeartbeatMonitor>(this);
  }

  std::mutex& getWatchdogMutex() {
    return workMetaListMutex_;
  }

  bool getErrorCaughtFlag() {
    return hasMonitorThreadCaughtError_;
  }

  void forceTryWriteDebugInfo() {
    std::future<bool> asyncDebugDump = std::async(
        std::launch::async, [this]() { return this->dumpDebuggingInfo(); });
    asyncDebugDump.wait();
  }

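  // Heartbeat monitor whose runLoop() catches the runtime_error escaping the
  // base-class loop and records it on the owning
  // ProcessGroupNCCLNoHeartbeatCaught.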
class TestHeartbeatMonitor : public c10d::ProcessGroupNCCL::HeartbeatMonitor {
|
|
|
|
|
public:
|
|
|
|
|
using HeartbeatMonitor::HeartbeatMonitor;
|
|
|
|
|
|
|
|
|
|
void runLoop() override {
|
|
|
|
|
try {
|
|
|
|
|
c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop();
|
|
|
|
|
} catch (std::runtime_error& e) {
|
|
|
|
|
// Safe cast because we know it's a ProcessGroupNCCLNoHeartbeatCaught
|
|
|
|
|
auto* pg = static_cast<ProcessGroupNCCLNoHeartbeatCaught*>(pg_);
|
|
|
|
|
pg->hasMonitorThreadCaughtError_ = true;
|
|
|
|
|
}
|
[NCCL PG] ADD a separate monitoring thread to ensure we collect debug info and check watchdog heartbeat (#112518)
This PR has the following goals:
1. Detect unhealthy nccl watchdog thread by implementing a heartbeat. NCCL watchdog sometimes can hang for several reasons such as nccl/cuda API bugs or unexpected blocking behaviors. This is the last resort to ensure that we don't silently keep the training job run for hours.
2. Sometimes, the process gets stuck in the destroy of NCCL PG, and this PR will ensure that we will eventually abort it after some time (by default 2 mins)
3. Once heartbeat cannot be heard, we dump debug information (for now, we just use the flight recorder implemented in https://github.com/pytorch/pytorch/pull/110960/files) to disk. (How and where to dump the debug info will be addressed in the following PR).
4. Finally, we initiate std::abort via `LOG(FATAL)` to kill the process.
To clarify further what this PR is trying to solve, we first list are four cases when a NCCL PG can end up with:
- case 1: ncclwatchdog gets stuck (maybe some blocking API) and heartbeat monitor kills it during regular heartbeat monitor loop.
- case 2: ncclwatchdog timeout and desync report or destroy kicked in(let's call it shutdown) but this shutdown takes so long and heartbeat believes it has to kills the process anyway.
- case 3: ncclwatchdog aborts the process (heartbeat monitor not involved)
- case 4: program exits cleanly (heartbeat monitor not involved)
As we can see here, this PR is trying to address case one and two and we also want to ensure adding one more monitor thread does not interfere what we are currently doing in case three and four. That's why we added two flags `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_`.
For case three and four, either `monitorWakeUpCV_` will be waked up in the destructor or `terminateHeartbeatMonitorThread_` will be set to true. So that monitor thread will just exit ASAP.
For case one, both `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_` will still false when monitor thread see there are no heartbeat, so it will directly kill the process. For case two, either `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_` will be true, the monitor thread will wait extra time before killing the process.
Differential Revision: [D51146305](https://our.internmc.facebook.com/intern/diff/D51146305)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112518
Approved by: https://github.com/kwen2501, https://github.com/wconstab
2023-11-09 13:55:49 -08:00
    }
  };

 protected:
  // It's really hard to unit test std::abort, so we override it instead.
  // (With this override commented out, we do see the process abort with a
  // core dump.)
  void terminateProcess(const std::string& errMsg) override {
    throw std::runtime_error(errMsg);
  }

  bool hasMonitorThreadCaughtError_{false};
};
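
// Variant of ProcessGroupNCCLNoHeartbeatCaught; judging by its name it targets
// the scenario where dumping debug info gets stuck, though it only forwards
// its constructor arguments here.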
class ProcessGroupNCCLDebugInfoStuck
    : public ProcessGroupNCCLNoHeartbeatCaught {
 public:
  ProcessGroupNCCLDebugInfoStuck(
      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> opts)
      : ProcessGroupNCCLNoHeartbeatCaught(store, rank, size, std::move(opts)) {}
};

class ProcessGroupNCCLErrorsTest : public ::testing::Test {
 protected:
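  // Returns true when the tests should be skipped: either no CUDA device is
  // visible or, when built with NCCL, the NCCL version predates error
  // handling support.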
  bool skipTest() {
    if (cudaNumDevices() == 0) {
      LOG(INFO) << "Skipping test since CUDA is not available";
      return true;
    }
#ifdef USE_C10D_NCCL
    if (torch::cuda::nccl::version() < kNcclErrorHandlingVersion) {
      LOG(INFO) << "Skipping test since NCCL version is too old";
      return true;
    }
#endif
    return false;
  }

  void SetUp() override {
    // Enable LOG(INFO) messages.
    c10::initLogging();
    // Need to have this check at SetUp to make sure we only run the test --
    // including the init -- when there are GPUs available.
    if (skipTest()) {
      GTEST_SKIP() << "Skipping ProcessGroupNCCLErrorsTest because system "
                   << "requirement is not met (no CUDA or GPU).";
    }

    size_t numDevices = 1; // One device per rank (thread)
    TemporaryFile file;
    store_ = c10::make_intrusive<::c10d::FileStore>(file.path, 1);

    tensors_.resize(numDevices);
    tensors_[0] = at::empty({3, 3}, at::kCUDA);
  }
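
  // Reset TORCH_NCCL_BLOCKING_WAIT so that one test's environment setting
  // does not leak into the next.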
  void TearDown() override {
    ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "0", 1) == 0);
  }

  std::vector<at::Tensor> tensors_;
  c10::intrusive_ptr<::c10d::FileStore> store_;
};
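
// With blocking wait enabled, a simulated NCCL error should surface as an
// exception from wait() on the collective issued after the error is injected.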
TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
  ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "1", 1) == 0);
  auto options = c10d::ProcessGroupNCCL::Options::create();
  options->timeout = std::chrono::milliseconds(1000);
  ProcessGroupNCCLSimulateErrors pg(store_, 0, 1, options);

  auto work = pg.allreduce(tensors_);
  work->wait();
  EXPECT_EQ(1, pg.getNCCLCommCacheSize());

  // Now run all reduce with errors.
  pg.simulateError();
  work = pg.allreduce(tensors_);
  // Verify the work item failed.
  EXPECT_THROW(work->wait(), std::runtime_error);
}
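
// Same flow, but the injected failure is a watchdog timeout, which should
// surface as c10::DistBackendError.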
TEST_F(ProcessGroupNCCLErrorsTest, testNCCLTimedoutErrorsBlocking) {
  ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "1", 1) == 0);
  auto options = c10d::ProcessGroupNCCL::Options::create();
  options->timeout = std::chrono::milliseconds(3000);
  ProcessGroupNCCLTimedOutErrors pg(store_, 0, 1, options);

  auto work = pg.allreduce(tensors_);
  work->wait();
  EXPECT_EQ(1, pg.getNCCLCommCacheSize());

  // Now run all reduce with errors.
  pg.setTimedoutError();
  work = pg.allreduce(tensors_);
  EXPECT_THROW(work->wait(), c10::DistBackendError);

  // Communicators might be aborted here, further operations would fail.
}
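
// Non-blocking variant: async error handling is disabled and error propagation
// is enabled, so the injected error is observed when a later barrier fails.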
TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNonBlocking) {
  // Keep the watchdog thread from throwing the exception and dumping FR, so
  // we can test the barrier's throw behavior.
  ASSERT_TRUE(
      setenv(c10d::TORCH_NCCL_ASYNC_ERROR_HANDLING[0].c_str(), "0", 1) == 0);
  ASSERT_TRUE(setenv(c10d::TORCH_NCCL_PROPAGATE_ERROR[0].c_str(), "1", 1) == 0);
  auto options = c10d::ProcessGroupNCCL::Options::create();
  options->timeout = std::chrono::milliseconds(3000);
  ProcessGroupNCCLSimulateErrors pg(store_, 0, 1, options);

  auto work = pg.allreduce(tensors_);
  pg.barrier()->wait();
  EXPECT_EQ(1, pg.getNCCLCommCacheSize());

  // Now run all reduce with errors.
  pg.simulateError();
  work = pg.allreduce(tensors_);
  work->wait();
  // An NCCL error that happened before should stop the thread from passing
  // the barrier.
  EXPECT_THROW(pg.barrier()->wait(), std::runtime_error);
}
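
// Helpers for validating the flight-recorder dump produced by the heartbeat
// monitor test below: readTraceFromFile reads the on-disk dump back, and
// TestDebugInfoWriter additionally keeps the dumped bytes in memory.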
// Function to read what we wrote to the local disk for validation.
std::string readTraceFromFile(const std::string& filename, size_t size) {
  std::ifstream file(filename, std::ios::binary);
  // Read the strings from the file
  if (file) { // While the file stream is in good state
    std::string str(size, '\0');
    file.read(&str[0], static_cast<std::streamsize>(size));
    if (file) {
      return str;
    }
  }
  return "";
}

// A DebugInfoWriter that, in addition to writing the flight-recorder trace to
// disk via the base class, keeps a copy in memory so the test can compare the
// two.
class TestDebugInfoWriter : public c10d::DebugInfoWriter {
 public:
  TestDebugInfoWriter(const std::string& namePrefix)
      : DebugInfoWriter(namePrefix, 0) {}

  void write(const std::string& ncclTrace) override {
    traces_.assign(ncclTrace.begin(), ncclTrace.end());
    c10d::DebugInfoWriter::write(ncclTrace);
  }

  std::vector<uint8_t>& getTraces() {
    return traces_;
  }

 private:
  std::vector<uint8_t> traces_;
};
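
// Heartbeat-monitor scenario: hold the watchdog lock long enough that the
// monitor thread misses the heartbeat, so it dumps the flight-recorder trace
// through the registered writer and records the error (surfaced here via
// getErrorCaughtFlag(), presumably thanks to the terminateProcess override
// that throws instead of aborting); then validate the on-disk dump against
// the in-memory copy.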
TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
  // Note (kwen2501) 03/07/2025
  // TODO: re-enable
  GTEST_SKIP() << "Skipping test as the trace write seems unstable.";
  int heartBeatIntervalInSec = 2;
  std::string timeInterval = std::to_string(heartBeatIntervalInSec);
  ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "0", 1) == 0);
  ASSERT_TRUE(
      setenv(
          c10d::TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC[0].c_str(),
          timeInterval.c_str(),
          1) == 0);
  ASSERT_TRUE(
      setenv(c10d::TORCH_NCCL_ENABLE_MONITORING[0].c_str(), "1", 1) == 0);
  auto tempFilename = c10::str(
      std::filesystem::temp_directory_path().string(), "/comm_lib_trace_rank_");
  ASSERT_TRUE(
      setenv("TORCH_NCCL_DEBUG_INFO_TEMP_FILE", tempFilename.c_str(), 1) == 0);
  // Enable the NCCL flight recorder.
  ASSERT_TRUE(setenv("TORCH_NCCL_TRACE_BUFFER_SIZE", "10", 1) == 0);
  ASSERT_TRUE(setenv(c10d::TORCH_NCCL_DUMP_ON_TIMEOUT[0].c_str(), "1", 1) == 0);
  auto options = c10d::ProcessGroupNCCL::Options::create();
  // Set a long watchdog timeout so that we have enough time to lock the
  // watchdog and let the heartbeat monitor thread kick in.
  options->timeout = std::chrono::milliseconds(30000);
  ProcessGroupNCCLNoHeartbeatCaught pg(store_, 0, 1, options);
  // The writer here is very similar to the fallback writer. The only
  // difference is that we also keep the traces in memory for validation.
  std::string fileNamePrefix = c10d::getCvarString(
      {"TORCH_NCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/comm_lib_trace_rank_");
  std::unique_ptr<TestDebugInfoWriter> writerForTestPtr =
      std::make_unique<TestDebugInfoWriter>(fileNamePrefix);
  std::vector<uint8_t>& traces = writerForTestPtr->getTraces();
  c10d::DebugInfoWriter::registerWriter(std::move(writerForTestPtr));

  // Normal collective case.
  auto work = pg.allreduce(tensors_);
  work->wait();

  work = pg.allreduce(tensors_);
  {
    // Now block the watchdog thread by holding its lock.
    std::lock_guard<std::mutex> lock(pg.getWatchdogMutex());
    LOG(INFO) << "Lock watchdog thread.";
    // Wait long enough for the monitor thread to throw its exception.
    std::this_thread::sleep_for(
        std::chrono::seconds(heartBeatIntervalInSec * 3));
    // Check that the monitoring thread launched and the exception was thrown.
    EXPECT_TRUE(pg.getErrorCaughtFlag());
  }
  work->wait();
  EXPECT_TRUE(!traces.empty());
  auto filename = c10::str(tempFilename, 0);
  auto traceFromStorage = readTraceFromFile(filename, traces.size());
  // Check that the traces read from storage match the original NCCL trace.
  EXPECT_TRUE(traceFromStorage == std::string(traces.begin(), traces.end()));
  std::filesystem::remove(filename);
}
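
// Fixture for watchdog-timeout tests: enables blocking wait, a short heartbeat
// timeout, the monitoring thread, and desync debug before each test (currently
// skipped pending the Work-queue refactor).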
class ProcessGroupNCCLWatchdogTimeoutTest : public ProcessGroupNCCLErrorsTest {
|
|
|
|
|
protected:
|
|
|
|
|
void SetUp() override {
|
2024-01-19 02:33:31 +00:00
|
|
|
// TODO (kwen2501)
|
|
|
|
|
GTEST_SKIP() << "Skipping tests under ProcessGroupNCCLWatchdogTimeoutTest; "
|
|
|
|
|
<< "will rewrite them after refactoring Work queues.";
|
[NCCL PG] ADD a separate monitoring thread to ensure we collect debug info and check watchdog heartbeat (#112518)
This PR has the following goals:
1. Detect unhealthy nccl watchdog thread by implementing a heartbeat. NCCL watchdog sometimes can hang for several reasons such as nccl/cuda API bugs or unexpected blocking behaviors. This is the last resort to ensure that we don't silently keep the training job run for hours.
2. Sometimes, the process gets stuck in the destroy of NCCL PG, and this PR will ensure that we will eventually abort it after some time (by default 2 mins)
3. Once heartbeat cannot be heard, we dump debug information (for now, we just use the flight recorder implemented in https://github.com/pytorch/pytorch/pull/110960/files) to disk. (How and where to dump the debug info will be addressed in the following PR).
4. Finally, we initiate std::abort via `LOG(FATAL)` to kill the process.
To clarify further what this PR is trying to solve, we first list are four cases when a NCCL PG can end up with:
- case 1: ncclwatchdog gets stuck (maybe some blocking API) and heartbeat monitor kills it during regular heartbeat monitor loop.
- case 2: ncclwatchdog timeout and desync report or destroy kicked in(let's call it shutdown) but this shutdown takes so long and heartbeat believes it has to kills the process anyway.
- case 3: ncclwatchdog aborts the process (heartbeat monitor not involved)
- case 4: program exits cleanly (heartbeat monitor not involved)
As we can see here, this PR is trying to address case one and two and we also want to ensure adding one more monitor thread does not interfere what we are currently doing in case three and four. That's why we added two flags `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_`.
For case three and four, either `monitorWakeUpCV_` will be waked up in the destructor or `terminateHeartbeatMonitorThread_` will be set to true. So that monitor thread will just exit ASAP.
For case one, both `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_` will still false when monitor thread see there are no heartbeat, so it will directly kill the process. For case two, either `terminateHeartbeatMonitorThread_` and `collectiveDebugInfoMode_` will be true, the monitor thread will wait extra time before killing the process.
Differential Revision: [D51146305](https://our.internmc.facebook.com/intern/diff/D51146305)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112518
Approved by: https://github.com/kwen2501, https://github.com/wconstab
2023-11-09 13:55:49 -08:00
|
|
|
ProcessGroupNCCLErrorsTest::SetUp();
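    // Configure heartbeat monitoring via environment variables: blocking
    // wait, a short heartbeat timeout (heartBeatIntervalInSec), the monitor
    // thread itself, and desync debug reporting.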
    std::string timeInterval = std::to_string(heartBeatIntervalInSec);
    ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "1", 1) == 0);
    ASSERT_TRUE(
        setenv(
            c10d::TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC[0].c_str(),
            timeInterval.c_str(),
            1) == 0);
    ASSERT_TRUE(
        setenv(c10d::TORCH_NCCL_ENABLE_MONITORING[0].c_str(), "1", 1) == 0);
    ASSERT_TRUE(setenv(c10d::TORCH_NCCL_DESYNC_DEBUG[0].c_str(), "1", 1) == 0);
    // We cannot capture an exception thrown in the watchdog thread without
    // making extensive changes to the code, so we disable async error
    // handling and keep the watchdog from throwing.
    ASSERT_TRUE(
        setenv(c10d::TORCH_NCCL_ASYNC_ERROR_HANDLING[0].c_str(), "0", 1) == 0);
    options_ = c10d::ProcessGroupNCCL::Options::create();
    // Set a very short watchdog timeout so collectives time out quickly.
    options_->timeout = std::chrono::milliseconds(100);
}
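
  // Common helper: force the desync-debug path, inject a timed-out error,
  // launch an allreduce, then sleep for `multiplier` heartbeat intervals so
  // the heartbeat monitor thread has time to react before we check that
  // wait() surfaces a c10::DistBackendError.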
  void watchdogTimeoutTestCommon(
      ProcessGroupNCCLNoHeartbeatCaught& pg,
      int multiplier) {
    pg.forceSetDesyncDebugFlag();
    pg.setTimedoutError();
    auto work = pg.allreduce(tensors_);
    std::this_thread::sleep_for(
        std::chrono::seconds(heartBeatIntervalInSec * multiplier));
    EXPECT_THROW(work->wait(), c10::DistBackendError);
  }

  const int heartBeatIntervalInSec = 2;
  c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> options_;
};
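
// The two tests below cover the two heartbeat-monitor outcomes: debug-info
// collection and PG destruction finishing in time (no forced abort) versus
// getting stuck (the monitor thread aborts after its extra wait).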
TEST_F(ProcessGroupNCCLWatchdogTimeoutTest, testNCCLTimedoutDebugInfoFinished) {
  ProcessGroupNCCLNoHeartbeatCaught pg(store_, 0, 1, options_);
  // Writing debug info makes the watchdog thread wait for 30 seconds, and
  // that wait is hard to override, so we trigger the write beforehand.
  // Otherwise we would need a long heartbeat timeout, which would make the
  // test much slower.
  pg.forceTryWriteDebugInfo();
  watchdogTimeoutTestCommon(pg, 2);

  // A false flag shows that the heartbeat monitor thread does not trigger a
  // process abort when collecting debug info and destroying the PG finish
  // quickly.
  EXPECT_FALSE(pg.getErrorCaughtFlag());

  // Communicators might be aborted here; further operations would fail.
}
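
// ProcessGroupNCCLDebugInfoStuck presumably simulates a debug-info dump that
// never completes, so the monitor thread's extra wait expires and it aborts.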
TEST_F(ProcessGroupNCCLWatchdogTimeoutTest, testNCCLTimedoutDebugInfoStuck) {
  ProcessGroupNCCLDebugInfoStuck pg(store_, 0, 1, options_);
  // Keep the main thread asleep longer so the heartbeat monitor thread can
  // finish its extra wait and flip the flag.
  watchdogTimeoutTestCommon(pg, 4);
  // A true flag shows that the heartbeat monitor thread does trigger a
  // process abort when collecting debug info gets stuck.
  EXPECT_TRUE(pg.getErrorCaughtFlag());

  // Communicators might be aborted here; further operations would fail.
}