Mirror of https://github.com/zebrajr/pytorch.git, synced 2026-01-15 12:15:51 +00:00
Revert "[SymmMem] Skip multicast init if any CUDA call fails (#168049)"
This reverts commit 8cb8b6cbbd.
Reverted https://github.com/pytorch/pytorch/pull/168049 on behalf of https://github.com/yangw-dev due to internal error D87346992: the changes conflict with the main branch. Please rebase and try to merge again. ([comment](https://github.com/pytorch/pytorch/pull/168049#issuecomment-3552985895))
@@ -20,22 +20,6 @@
   } \
   } while (0)
 
-#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \
-  do { \
-    CUresult __err = EXPR; \
-    if (__err != CUDA_SUCCESS) { \
-      const char* err_str; \
-      CUresult get_error_str_err [[maybe_unused]] = \
-          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
-      if (get_error_str_err != CUDA_SUCCESS) { \
-        TORCH_WARN("CUDA driver error: unknown error"); \
-      } else { \
-        TORCH_WARN("CUDA driver error: ", err_str); \
-      } \
-      goto NEXT; \
-    } \
-  } while (0)
-
 // The integer in the second column specifies the requested CUDA Driver API
 // version. The dynamic loader will accept a driver with a newer version, but it
 // ensures that the requested symbol exists in *at least* the specified version
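The removed C10_CUDA_DRIVER_CHECK_GOTO macro routes any failed driver call to a caller-supplied label instead of throwing, so one label can serve as a shared cleanup and agreement point. A minimal self-contained sketch of the same pattern, with hypothetical names and a simplified error type (not the PyTorch implementation):

#include <cstdio>

// Simplified stand-ins for CUresult and a driver entry point.
enum Result { SUCCESS = 0, FAILURE = 1 };
static Result fakeDriverCall(bool ok) { return ok ? SUCCESS : FAILURE; }

// On failure: warn and jump to the caller-supplied label, in the spirit of
// the macro removed above.
#define CHECK_GOTO(EXPR, NEXT)                      \
  do {                                              \
    Result __err = (EXPR);                          \
    if (__err != SUCCESS) {                         \
      std::fprintf(stderr, "driver call failed\n"); \
      goto NEXT;                                    \
    }                                               \
  } while (0)

int main() {
  bool success = false;
  CHECK_GOTO(fakeDriverCall(true), check_all);
  CHECK_GOTO(fakeDriverCall(false), check_all);  // jumps past the flag flip
  success = true;
check_all:
  // Success and failure paths converge here.
  std::printf("success = %d\n", success);
  return 0;
}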
@@ -517,11 +517,6 @@ static void init_multicast_for_block(
   using McHandleType =
       std::conditional_t<use_fabric_handle, CUmemFabricHandle, int>;
 
-  McHandleType invalidator;
-  std::memset(&invalidator, UINT8_MAX, sizeof(McHandleType));
-
-  // Phase 1: export handle (rank 0 only)
-  McHandleType mc_exported_handle{};
   if (rank == 0) {
     CUmulticastObjectProp mc_prop{};
     mc_prop.numDevices = world_size;
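The removed invalidator is an all-0xFF sentinel of the handle type: a rank that failed sends this value in place of a real handle, and peers detect it with memcmp. A standalone sketch of the idea, using a hypothetical handle type rather than the PyTorch code:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for an opaque handle type such as CUmemFabricHandle.
struct Handle {
  unsigned char data[64];
};

int main() {
  // Every byte set to 0xFF marks "no valid handle".
  Handle invalidator;
  std::memset(&invalidator, UINT8_MAX, sizeof(Handle));

  Handle received{};  // pretend this arrived from the handle exchange
  bool peer_skipped =
      std::memcmp(&received, &invalidator, sizeof(Handle)) == 0;
  std::printf("peer skipped init: %s\n", peer_skipped ? "yes" : "no");
  return 0;
}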
@@ -530,82 +525,68 @@
 
     // create a multicast object, which acts as a handle that allows multiple
     // devices or processes to access the same memory allocation coherently.
-    try {
-      C10_CUDA_DRIVER_CHECK(
-          driver_api->cuMulticastCreate_(&mc_handle, &mc_prop));
-      // using the CUDA Driver API to export a multicast object into a POSIX file
-      // descriptor.
-      C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_(
-          &mc_exported_handle, mc_handle, handleType, 0));
-    } catch (const std::exception& e) {
-      // Allow peers gracefully skip multicast initialization by sending -1
-      mc_exported_handle = invalidator;
+    auto err = driver_api->cuMulticastCreate_(&mc_handle, &mc_prop);
+    if (err != CUDA_SUCCESS) {
+      const char* err_str;
+      CUresult get_error_str_err = driver_api->cuGetErrorString_(err, &err_str);
+      if (get_error_str_err != CUDA_SUCCESS) {
+        err_str = "unknown cuda driver error";
+      }
       LOG(WARNING)
-          << "SymmetricMemory: fail to export multicast handle.\n"
-          << e.what();
+          << "SymmetricMemory: cuMulticastCreate failed with: \"" << err_str
+          << "\". Gracefully skipping multicast initialization. "
+          << "However, this is unexpected. Please report the issue on GitHub.";
+      // Allow peers gracefully skip multicast initialization by sending -1
+      // TODO: allow graceful skip for fabric
+      if constexpr (!use_fabric_handle) {
+        ipc_channel.broadcast_fds(rank, 0, pids, -1);
+      }
+      return;
     }
-  }
 
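Both versions report driver errors the same way: ask the driver for a message string, and fall back to a fixed string if even that lookup fails. A minimal sketch of that double-checked reporting, with fake status codes and fakeGetErrorString standing in for the driver's cuGetErrorString entry point:

#include <cstdio>

enum Status { OK = 0, ERR_BAD = 1, ERR_UNKNOWN = 999 };

static Status fakeGetErrorString(Status s, const char** out) {
  if (s == ERR_BAD) {
    *out = "bad thing happened";
    return OK;
  }
  return ERR_UNKNOWN;  // the lookup itself failed
}

static void warn_on_error(Status err) {
  if (err == OK) return;
  const char* err_str;
  if (fakeGetErrorString(err, &err_str) != OK) {
    err_str = "unknown cuda driver error";  // fixed fallback message
  }
  std::fprintf(stderr, "warning: call failed with: \"%s\"\n", err_str);
}

int main() {
  warn_on_error(ERR_BAD);      // prints the looked-up message
  warn_on_error(ERR_UNKNOWN);  // falls back to the fixed message
  return 0;
}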
-  // Phase 2: Exchange handle
-  McHandleType recv_handle;
-  if constexpr (!use_fabric_handle) {
-    recv_handle = ipc_channel.broadcast_fds(rank, 0, pids, mc_exported_handle);
-  } else {
-    // TODO implement storeExchange.broadcast
-    auto gathered_handles = storeExchange.all_gather(store, rank, world_size, mc_exported_handle);
-    recv_handle = std::move(gathered_handles[0]);
-  }
-
-  // Check exchange result
-  if (memcmp(&recv_handle, &invalidator, sizeof(McHandleType)) == 0) {
-    LOG(WARNING) << "Gracefully skipping multicast initialization.";
-    return;
-  }
-
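In the fabric path, the removed Phase 2 works around the missing storeExchange.broadcast (see the TODO) by all-gathering every rank's handle and having each rank read slot 0, the root's contribution. Sketched with plain integers and hypothetical values; a real all-gather fills every slot identically on every rank:

#include <cstdio>
#include <vector>

int main() {
  const int world_size = 4;
  // What every rank would see after the all-gather: slot r holds rank r's
  // value, and only rank 0 contributed a real handle (here: 42).
  std::vector<int> gathered = {42, 0, 0, 0};

  // Reading slot 0 on every rank behaves like a broadcast from rank 0.
  for (int rank = 0; rank < world_size; ++rank) {
    int received = gathered[0];
    std::printf("rank %d received %d\n", rank, received);
  }
  return 0;
}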
-  // Flip to true after all CUDA steps finish
-  bool success_end = false;
-
-  // Phase 3: Import handle (non-0 ranks only)
-  if (rank != 0) {
+    McHandleType mc_exported_handle;
+    // using the CUDA Driver API to export a multicast object into a POSIX file
+    // descriptor.
+    C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_(
+        &mc_exported_handle, mc_handle, handleType, 0));
     if constexpr (!use_fabric_handle) {
-      // Convert back to a handle from the broadcasted POSIX file descriptor.
-      C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMemImportFromShareableHandle_(
-          &mc_handle,
-          (void*)(uintptr_t)recv_handle,
-          CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR), check_all);
+      ipc_channel.broadcast_fds(rank, 0, pids, mc_exported_handle);
+      // Ref count is incremented as soon as SCM_RIGHTS send happens
+      close(mc_exported_handle);
     } else {
-      C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMemImportFromShareableHandle_(
-          &mc_handle, (void*)&(recv_handle), CU_MEM_HANDLE_TYPE_FABRIC), check_all);
+      // TODO implement storeExchange.broadcast
+      storeExchange.all_gather(store, rank, world_size, mc_exported_handle);
     }
+
+  } else {
+    if constexpr (!use_fabric_handle) {
+      int mc_fd = ipc_channel.broadcast_fds(rank, 0, pids, -1);
+      if (mc_fd == -1) {
+        return;
+      }
+      // Convert back to a handle from the broadcasted POSIX file descriptor.
+      C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_(
+          &mc_handle,
+          (void*)(uintptr_t)mc_fd,
+          CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
+      close(mc_fd);
+    } else {
+      CUmemFabricHandle null_handle{};
+      auto mc_handles =
+          storeExchange.all_gather(store, rank, world_size, null_handle);
+      C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_(
+          &mc_handle, (void*)&(mc_handles[0]), CU_MEM_HANDLE_TYPE_FABRIC));
+    }
   }
 
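The close() right after the fd broadcast is safe because SCM_RIGHTS passing duplicates the descriptor inside the kernel at send time, so the receiver's copy is independent of the sender's. A minimal POSIX sketch of sending one fd over a Unix-domain socket (Linux; send_fd is a hypothetical helper, not the PyTorch IPC channel):

#include <cstring>
#include <sys/socket.h>
#include <unistd.h>

static int send_fd(int sock, int fd) {
  char payload = 'x';
  iovec iov{&payload, sizeof(payload)};
  char cmsg_buf[CMSG_SPACE(sizeof(int))] = {};
  msghdr msg{};
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = cmsg_buf;
  msg.msg_controllen = sizeof(cmsg_buf);
  cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;  // pass the descriptor itself
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  std::memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
  if (sendmsg(sock, &msg, 0) < 0) {
    return -1;
  }
  close(fd);  // safe: the receiver's copy is independent after sendmsg()
  return 0;
}

int main() {
  int sv[2];
  if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) != 0) return 1;
  send_fd(sv[0], dup(STDOUT_FILENO));  // pass a copy of stdout across
  return 0;
}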
-  // Phase 4: Bind memory
   // All rank adds their physical allocation to the multicast object
-  C10_CUDA_DRIVER_CHECK_GOTO(
-      driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx), check_all);
-  C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMulticastBindMem_(
-      mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0), check_all);
+  C10_CUDA_DRIVER_CHECK(
+      driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx));
+  C10_CUDA_DRIVER_CHECK(driver_api->cuMulticastBindMem_(
+      mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0));
 
-  success_end = true;
-
-check_all:
-  // Whether all ranks have succeeded
-  bool all_succeed = true;
-  auto rank_successes = storeExchange.all_gather(store, rank, world_size, success_end);
-  for (int r = 0; r < world_size; ++r) {
-    all_succeed &= rank_successes[r];
-  }
-  // Close the file descriptor before exit
-  if constexpr (!use_fabric_handle) {
-    close(recv_handle);
-  }
-  if (!all_succeed) {
-    LOG(WARNING) << "Gracefully skipping multicast initialization.";
-    return;
-  }
 
-  // Phase 5: Map to virtual memory
   map_block(&mc_addr, mc_handle, block->block_size, block->device_idx);
   storeExchange.barrier(store, rank, world_size);
 #endif
 }
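The removed check_all block is a collective agreement: every rank reaches the label whether its CUDA calls succeeded or not, publishes a success flag via all-gather, and ANDs the flags so that all ranks either keep multicast or skip it together. The decision logic in isolation, with hypothetical flags in place of the store exchange:

#include <cstdio>
#include <vector>

int main() {
  const int world_size = 4;
  // Pretend these came back from storeExchange.all_gather(..., success_end).
  std::vector<bool> rank_successes = {true, true, false, true};

  // One failure anywhere means every rank skips, keeping them in sync.
  bool all_succeed = true;
  for (int r = 0; r < world_size; ++r) {
    all_succeed = all_succeed && rank_successes[r];
  }

  if (!all_succeed) {
    std::printf("Gracefully skipping multicast initialization.\n");
    return 0;
  }
  std::printf("all %d ranks succeeded; binding multicast\n", world_size);
  return 0;
}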