#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include <c10/cuda/CUDAGuard.h>
#include <c10/util/irange.h>

#include <ATen/cuda/CUDAContext.h>

#include <gtest/gtest.h>

#include <torch/csrc/distributed/c10d/FileStore.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>

#include "CUDATest.hpp"
#include "TestUtils.hpp"

using namespace c10d::test;

using at::cuda::CUDAStream;
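
// Constructs N test fixtures of type T that share the same FileStore path,
// calls start(rank, size) on each from its own thread so the Gloo rendezvous
// can complete, and returns the started fixtures.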
template <typename T, typename... Args>
std::vector<T> initialize(const std::string& path, size_t N, Args&&... args) {
  std::vector<T> tests;
  for ([[maybe_unused]] const auto i : c10::irange(N)) {
    tests.push_back(std::move(T(path, std::forward<Args>(args)...)));
  }

  std::vector<std::thread> threads;
  for ([[maybe_unused]] const auto i : c10::irange(N)) {
    threads.push_back(std::thread([i, N, &tests] { tests[i].start(i, N); }));
  }

  for (auto& thread : threads) {
    thread.join();
  }

  return tests;
}
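
// Base test fixture: owns a ProcessGroupGloo instance that rendezvouses
// through a FileStore and communicates over the loopback interface.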
class AsyncTest {
 public:
  AsyncTest(std::string path) : path_(std::move(path)) {}

  AsyncTest(AsyncTest&& other) noexcept = default;

  ::c10d::ProcessGroupGloo& getProcessGroup() {
    return *pg_;
  }
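
  // Rendezvous through the FileStore at path_ and construct the Gloo process
  // group for this rank, bound to 127.0.0.1 with a deliberately short timeout.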
  void start(int rank, int size) {
    auto store = c10::make_intrusive<::c10d::FileStore>(path_, size);

    // Use tiny timeout to make this test run fast
    auto options = ::c10d::ProcessGroupGloo::Options::create();
    options->timeout = std::chrono::milliseconds(50);
    options->devices.push_back(
        ::c10d::ProcessGroupGloo::createDeviceForHostname("127.0.0.1"));

    pg_ =
        std::make_unique<::c10d::ProcessGroupGloo>(store, rank, size, options);
  }

 protected:
  std::string path_;
  std::unique_ptr<::c10d::ProcessGroupGloo> pg_;
};
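
// Fixture that spreads numTensors CUDA tensors round-robin over the available
// devices and allocates one stream per device. The collectives below operate
// on inputs_ in place, so the inputs double as the outputs.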
class AsyncInputIsOutputTest : public AsyncTest {
 public:
  AsyncInputIsOutputTest(const std::string& path, int numTensors)
      : AsyncTest(path),
        numTensors_(numTensors),
        numDevices_(cudaNumDevices()) {
    // Allocate inputs on available devices in a round robin fashion.
    ::at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
    inputs_.resize(numTensors_);
    for (const auto i : c10::irange(numTensors_)) {
      inputs_[i] = at::empty(
          {16, 16},
          at::device(
              {at::kCUDA, static_cast<c10::DeviceIndex>(i % numDevices_)}));
    }

    // Allocate a stream per device.
    //
    // The "current stream" is tracked globally per device, so we can't
    // make two tensors on the same device use different streams and pass
    // this along to the collective (it uses the current-stream getters to
    // decide which streams to run on).
    //
    at::cuda::OptionalCUDAGuard deviceGuard;
    streams_.reserve(numDevices_);
    for (const auto i : c10::irange(numDevices_)) {
      deviceGuard.set_index(static_cast<c10::DeviceIndex>(i));
      streams_.push_back(at::cuda::getStreamFromPool());
    }
  }
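
  // Block until the asynchronous work has completed, with the fixture's
  // per-device streams set as the current streams.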
  void wait(c10::intrusive_ptr<c10d::Work>& work) {
    c10::cuda::CUDAMultiStreamGuard guard(streams_);
    work->wait();
  }
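
  // Copy GPU tensors to the CPU (on the fixture's streams) so the test body
  // can inspect their values; getTensors() does this for inputs_.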
  std::vector<at::Tensor> getCpuTensors(
      const std::vector<at::Tensor>& gpu_tensors) {
    std::vector<at::Tensor> outputs(gpu_tensors.size());

    // For the duration of this function, make our streams the current
    // CUDA streams.
    c10::cuda::CUDAMultiStreamGuard guard(streams_);

    // Copy inputs to outputs
    for (unsigned i = 0; i < gpu_tensors.size(); i++) {
      outputs[i] = gpu_tensors[i].cpu();
    }

    return outputs;
  }

  std::vector<at::Tensor> getTensors() {
    return getCpuTensors(inputs_);
  }

 protected:
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const int numTensors_;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const int numDevices_;
  std::vector<at::Tensor> inputs_;
  std::vector<CUDAStream> streams_;
};
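
// Allreduce variant: every rank fills its i-th tensor with
// (rank * numTensors + i), queues cudaSleep work on each stream so the
// streams are still busy when the collective is launched, and then kicks off
// an asynchronous allreduce on inputs_.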
class AsyncAllreduceTest : public AsyncInputIsOutputTest {
 public:
  AsyncAllreduceTest(const std::string& path, int numTensors)
      : AsyncInputIsOutputTest(path, numTensors) {}

  c10::intrusive_ptr<c10d::Work> run() {
    // For the duration of this function, make our streams the current
    // CUDA streams.
    c10::cuda::CUDAMultiStreamGuard guard(streams_);

    // Launch sleep on every stream
    at::cuda::OptionalCUDAGuard deviceGuard;
    for (const auto i : c10::irange(numDevices_)) {
      deviceGuard.set_index(static_cast<c10::DeviceIndex>(i));
      cudaSleep(streams_[i], 10ull * 1000 * 1000);
    }

    // Launch value initialization for every tensor
    for (const auto i : c10::irange(numTensors_)) {
      deviceGuard.set_index(static_cast<c10::DeviceIndex>(i % numDevices_));
      inputs_[i].fill_(pg_->getRank() * numTensors_ + i);
    }

    return pg_->allreduce(inputs_);
  }
};
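
// Broadcast variant: initializes tensors the same way as the allreduce test,
// then broadcasts from the given root rank and root tensor.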
class AsyncBroadcastTest : public AsyncInputIsOutputTest {
 public:
  AsyncBroadcastTest(const std::string& path, int numTensors)
      : AsyncInputIsOutputTest(path, numTensors) {}

  c10::intrusive_ptr<c10d::Work> run(size_t rootRank, size_t rootTensor) {
    // For the duration of this function, make our streams the current
    // CUDA streams.
    c10::cuda::CUDAMultiStreamGuard guard(streams_);

    // Launch sleep on every stream
    at::cuda::OptionalCUDAGuard deviceGuard;
    for (const auto i : c10::irange(numDevices_)) {
      deviceGuard.set_index(static_cast<c10::DeviceIndex>(i));
      cudaSleep(streams_[i], 10ull * 1000 * 1000);
    }

    // Launch value initialization for every tensor
    for (const auto i : c10::irange(numTensors_)) {
      deviceGuard.set_index(static_cast<c10::DeviceIndex>(i % numDevices_));
      inputs_[i].fill_(pg_->getRank() * numTensors_ + i);
    }

    ::c10d::BroadcastOptions options;
    options.rootRank = static_cast<int64_t>(rootRank);
    options.rootTensor = static_cast<int64_t>(rootTensor);
    return pg_->broadcast(inputs_, options);
  }
};
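
// Drives AsyncAllreduceTest across numProcesses ranks. Rank r fills its i-th
// tensor with (r * numTensors + i), so the size = numProcesses * numTensors
// contributions cover 0..(size - 1) and every element of the reduced result
// should equal 0 + 1 + ... + (size - 1) = size * (size - 1) / 2.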
void runAsyncAllreduceTest(
    const std::string& path,
    size_t numProcesses = 4,
    size_t numTensors = 2) {
  auto tests = initialize<AsyncAllreduceTest>(path, numProcesses, numTensors);
  std::vector<c10::intrusive_ptr<c10d::Work>> work(numProcesses);
  for (const auto i : c10::irange(numProcesses)) {
    work[i] = tests[i].run();
  }

  // Wait for work to complete
  for (const auto i : c10::irange(numProcesses)) {
    tests[i].wait(work[i]);
  }

  // Check results
  for (const auto i : c10::irange(numProcesses)) {
    const auto size = numProcesses * numTensors;
    const auto expected = (size * (size - 1)) / 2;
    auto tensors = tests[i].getTensors();
    auto results = tests[i].getCpuTensors(work[i]->result());
    EXPECT_EQ(tensors.size(), results.size());

    for (const auto j : c10::irange(tensors.size())) {
      auto& tensor = tensors[j];
      auto data = tensor.data_ptr<float>();

      auto& result_tensor = results[j];
      auto result_data = result_tensor.data_ptr<float>();

      EXPECT_EQ(tensor.numel(), result_tensor.numel());

      for (const auto k : c10::irange(tensor.numel())) {
        EXPECT_EQ(data[k], expected);
        EXPECT_EQ(result_data[k], expected);
      }
    }
  }
}
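
// Drives AsyncBroadcastTest over every (rootRank, rootTensor) pair. After the
// broadcast, every tensor on every rank should hold the root's fill value,
// rootRank * numTensors + rootTensor.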
void runAsyncBroadcastTest(
    const std::string& path,
    size_t numProcesses = 4,
    size_t numTensors = 1) {
  auto tests = initialize<AsyncBroadcastTest>(path, numProcesses, numTensors);

  // Try every permutation of root rank and root tensor
  for (const auto rootRank : c10::irange(numProcesses)) {
    for (const auto rootTensor : c10::irange(numTensors)) {
      std::vector<c10::intrusive_ptr<c10d::Work>> work(numProcesses);
      for (const auto i : c10::irange(numProcesses)) {
        work[i] = tests[i].run(rootRank, rootTensor);
      }

      // Wait for work to complete
      for (const auto i : c10::irange(numProcesses)) {
        tests[i].wait(work[i]);
      }

      // Check results
      const auto expected = (rootRank * numTensors + rootTensor);
      for (const auto i : c10::irange(numProcesses)) {
        auto tensors = tests[i].getTensors();
        for (const auto& tensor : tensors) {
          const auto* const data = tensor.const_data_ptr<float>();
          for (const auto k : c10::irange(tensor.numel())) {
            EXPECT_EQ(data[k], expected);
          }
        }
      }
    }
  }
}
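
// These tests need at least one CUDA device; they log and return early when
// CUDA is unavailable so the suite still passes on CPU-only machines.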
#ifdef USE_CUDA
TEST(ProcessGroupGlooAsyncTest, testAsyncAllreduce) {
  if (!at::cuda::is_available()) {
    LOG(INFO) << "CUDA not available, skipping testAsyncAllreduce";
    return;
  }
  TemporaryFile file;
  runAsyncAllreduceTest(file.path);
}

TEST(ProcessGroupGlooAsyncTest, testAsyncBroadcast) {
  if (!at::cuda::is_available()) {
    LOG(INFO) << "CUDA not available, skipping testAsyncBroadcast";
    return;
  }
  TemporaryFile file;
  runAsyncBroadcastTest(file.path);
}
#endif