Revert "[SymmMem] Skip multicast init if any CUDA call fails (#168049)"

This reverts commit 8cb8b6cbbd.

Reverted https://github.com/pytorch/pytorch/pull/168049 on behalf of https://github.com/yangw-dev due to an internal error in D87346992: the changes conflict with the main branch when merging. Please rebase this diff and try to merge again. ([comment](https://github.com/pytorch/pytorch/pull/168049#issuecomment-3552985895))
Author: PyTorch MergeBot
Date: 2025-11-19 14:26:53 +00:00
parent a0ccd3e5ff
commit 5abb7bf8fe
2 changed files with 51 additions and 86 deletions


@@ -20,22 +20,6 @@
     } \
   } while (0)
 
-#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \
-  do { \
-    CUresult __err = EXPR; \
-    if (__err != CUDA_SUCCESS) { \
-      const char* err_str; \
-      CUresult get_error_str_err [[maybe_unused]] = \
-          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
-      if (get_error_str_err != CUDA_SUCCESS) { \
-        TORCH_WARN("CUDA driver error: unknown error"); \
-      } else { \
-        TORCH_WARN("CUDA driver error: ", err_str); \
-      } \
-      goto NEXT; \
-    } \
-  } while (0)
-
 // The integer in the second column specifies the requested CUDA Driver API
 // version. The dynamic loader will accept a driver with a newer version, but it
 // ensures that the requested symbol exists in *at least* the specified version
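
For readers skimming the revert: the macro removed above mirrors C10_CUDA_DRIVER_CHECK but warns instead of throwing and then jumps to a caller-supplied label, so a failing driver call can fall through to a shared cleanup/synchronization point. Below is a minimal, self-contained sketch of that warn-and-goto idiom; the names (DRIVER_CHECK_GOTO, FakeResult, fake_driver_call) are illustrative stand-ins and are not part of PyTorch.

```cpp
#include <cstdio>

// Stand-in for CUresult / CUDA_SUCCESS in this illustration.
enum FakeResult { FAKE_SUCCESS = 0, FAKE_ERROR = 1 };

// Warn-and-goto check: on failure, log and jump to the caller-supplied label
// instead of throwing (the idea behind the reverted C10_CUDA_DRIVER_CHECK_GOTO).
#define DRIVER_CHECK_GOTO(EXPR, NEXT)                              \
  do {                                                             \
    FakeResult err_ = (EXPR);                                      \
    if (err_ != FAKE_SUCCESS) {                                    \
      std::fprintf(stderr, "driver call failed: %d\n", (int)err_); \
      goto NEXT;                                                   \
    }                                                              \
  } while (0)

static FakeResult fake_driver_call(bool ok) {
  return ok ? FAKE_SUCCESS : FAKE_ERROR;
}

int main() {
  bool success = false;
  DRIVER_CHECK_GOTO(fake_driver_call(true), cleanup);   // passes
  DRIVER_CHECK_GOTO(fake_driver_call(false), cleanup);  // fails -> jumps
  success = true;  // skipped when the second call fails
cleanup:
  // In the reverted patch, this is where all ranks exchanged success flags
  // before deciding whether to continue with multicast setup.
  std::printf("success = %s\n", success ? "true" : "false");
  return 0;
}
```

Compiled standalone, the second check fails, the goto skips `success = true;`, and the cleanup label runs with `success == false`, which is the same shape as the reverted code's `check_all:` section.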


@@ -517,11 +517,6 @@ static void init_multicast_for_block(
   using McHandleType =
       std::conditional_t<use_fabric_handle, CUmemFabricHandle, int>;
-  McHandleType invalidator;
-  std::memset(&invalidator, UINT8_MAX, sizeof(McHandleType));
-  // Phase 1: export handle (rank 0 only)
-  McHandleType mc_exported_handle{};
   if (rank == 0) {
     CUmulticastObjectProp mc_prop{};
     mc_prop.numDevices = world_size;
@@ -530,82 +525,68 @@
     // create a multicast object, which acts as a handle that allows multiple
     // devices or processes to access the same memory allocation coherently.
-    try {
-      C10_CUDA_DRIVER_CHECK(
-          driver_api->cuMulticastCreate_(&mc_handle, &mc_prop));
-      // using the CUDA Driver API to export a multicast object into a POSIX file
-      // descriptor.
-      C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_(
-          &mc_exported_handle, mc_handle, handleType, 0));
-    } catch (const std::exception& e) {
-      // Allow peers gracefully skip multicast initialization by sending -1
-      mc_exported_handle = invalidator;
+    auto err = driver_api->cuMulticastCreate_(&mc_handle, &mc_prop);
+    if (err != CUDA_SUCCESS) {
+      const char* err_str;
+      CUresult get_error_str_err = driver_api->cuGetErrorString_(err, &err_str);
+      if (get_error_str_err != CUDA_SUCCESS) {
+        err_str = "unknown cuda driver error";
+      }
       LOG(WARNING)
-          << "SymmetricMemory: fail to export multicast handle.\n"
-          << e.what();
+          << "SymmetricMemory: cuMulticastCreate failed with: \"" << err_str
+          << "\". Gracefully skipping multicast initialization. "
+          << "However, this is unexpected. Please report the issue on GitHub.";
+      // Allow peers gracefully skip multicast initialization by sending -1
+      // TODO: allow graceful skip for fabric
+      if constexpr (!use_fabric_handle) {
+        ipc_channel.broadcast_fds(rank, 0, pids, -1);
+      }
+      return;
     }
-  }
-  // Phase 2: Exchange handle
-  McHandleType recv_handle;
-  if constexpr (!use_fabric_handle) {
-    recv_handle = ipc_channel.broadcast_fds(rank, 0, pids, mc_exported_handle);
-  } else {
-    // TODO implement storeExchange.broadcast
-    auto gathered_handles = storeExchange.all_gather(store, rank, world_size, mc_exported_handle);
-    recv_handle = std::move(gathered_handles[0]);
-  }
-  // Check exchange result
-  if (memcmp(&recv_handle, &invalidator, sizeof(McHandleType)) == 0) {
-    LOG(WARNING) << "Gracefully skipping multicast initialization.";
-    return;
-  }
-  // Flip to true after all CUDA steps finish
-  bool success_end = false;
-  // Phase 3: Import handle (non-0 ranks only)
-  if (rank != 0) {
+    McHandleType mc_exported_handle;
+    // using the CUDA Driver API to export a multicast object into a POSIX file
+    // descriptor.
+    C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_(
+        &mc_exported_handle, mc_handle, handleType, 0));
     if constexpr (!use_fabric_handle) {
-      // Convert back to a handle from the broadcasted POSIX file descriptor.
-      C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMemImportFromShareableHandle_(
-          &mc_handle,
-          (void*)(uintptr_t)recv_handle,
-          CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR), check_all);
+      ipc_channel.broadcast_fds(rank, 0, pids, mc_exported_handle);
+      // Ref count is incremented as soon as SCM_RIGHTS send happens
+      close(mc_exported_handle);
     } else {
-      C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMemImportFromShareableHandle_(
-          &mc_handle, (void*)&(recv_handle), CU_MEM_HANDLE_TYPE_FABRIC), check_all);
+      // TODO implement storeExchange.broadcast
+      storeExchange.all_gather(store, rank, world_size, mc_exported_handle);
     }
+  } else {
+    if constexpr (!use_fabric_handle) {
+      int mc_fd = ipc_channel.broadcast_fds(rank, 0, pids, -1);
+      if (mc_fd == -1) {
+        return;
+      }
+      // Convert back to a handle from the broadcasted POSIX file descriptor.
+      C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_(
+          &mc_handle,
+          (void*)(uintptr_t)mc_fd,
+          CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
+      close(mc_fd);
+    } else {
+      CUmemFabricHandle null_handle{};
+      auto mc_handles =
+          storeExchange.all_gather(store, rank, world_size, null_handle);
+      C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_(
+          &mc_handle, (void*)&(mc_handles[0]), CU_MEM_HANDLE_TYPE_FABRIC));
+    }
   }
-  // Phase 4: Bind memory
   // All rank adds their physical allocation to the multicast object
-  C10_CUDA_DRIVER_CHECK_GOTO(
-      driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx), check_all);
-  C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMulticastBindMem_(
-      mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0), check_all);
+  C10_CUDA_DRIVER_CHECK(
+      driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx));
+  C10_CUDA_DRIVER_CHECK(driver_api->cuMulticastBindMem_(
+      mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0));
 
-  success_end = true;
-check_all:
-  // Whether all ranks have succeeded
-  bool all_succeed = true;
-  auto rank_successes = storeExchange.all_gather(store, rank, world_size, success_end);
-  for (int r = 0; r < world_size; ++r) {
-    all_succeed &= rank_successes[r];
-  }
-  // Close the file descriptor before exit
-  if constexpr (!use_fabric_handle) {
-    close(recv_handle);
-  }
-  if (!all_succeed) {
-    LOG(WARNING) << "Gracefully skipping multicast initialization.";
-    return;
-  }
-  // Phase 5: Map to virtual memory
   map_block(&mc_addr, mc_handle, block->block_size, block->device_idx);
   storeExchange.barrier(store, rank, world_size);
 #endif
 }
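
To summarize the control flow this hunk restores: rank 0 either broadcasts a valid exported handle or the sentinel -1, and peer ranks return early when they receive the sentinel. The sketch below models only that skip protocol; `broadcast_fd` and `init_multicast` are illustrative stand-ins for `IpcChannel::broadcast_fds` and `init_multicast_for_block`, with the CUDA driver calls omitted.

```cpp
#include <cstdio>

// Trivial stand-in for IpcChannel::broadcast_fds: every rank "receives"
// whatever rank 0 decided to send (the real code passes fds over a Unix
// domain socket with SCM_RIGHTS).
static int broadcast_fd(int fd_from_rank0) {
  return fd_from_rank0;
}

// Mirrors the restored flow of init_multicast_for_block, minus the CUDA calls.
static void init_multicast(int rank, bool create_ok, int exported_fd) {
  int fd_sent_by_rank0 = create_ok ? exported_fd : -1;
  if (rank == 0) {
    if (!create_ok) {
      // cuMulticastCreate failed: broadcast -1 so peers skip gracefully.
      broadcast_fd(-1);
      std::printf("rank 0: create failed, told peers to skip\n");
      return;
    }
    broadcast_fd(exported_fd);  // share the exported handle (a POSIX fd)
  } else {
    int fd = broadcast_fd(fd_sent_by_rank0);
    if (fd == -1) {
      // Mirrors the `if (mc_fd == -1) return;` path in the restored code.
      std::printf("rank %d: skipping multicast init\n", rank);
      return;
    }
    // Import the handle here, then fall through to bind + map.
  }
  std::printf("rank %d: binding and mapping multicast memory\n", rank);
}

int main() {
  // Failure scenario: rank 0's create fails, everyone skips gracefully.
  init_multicast(/*rank=*/0, /*create_ok=*/false, /*exported_fd=*/7);
  init_multicast(/*rank=*/1, /*create_ok=*/false, /*exported_fd=*/7);
  return 0;
}
```

The reverted patch replaced this per-rank early return with a collective check (each rank all-gathering a success flag after the bind step); the restored code instead relies on the -1 sentinel and lets C10_CUDA_DRIVER_CHECK throw on later failures.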