From 5abb7bf8fee800e92028e57ebbb41e2e9f62d499 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot
Date: Wed, 19 Nov 2025 14:26:53 +0000
Subject: [PATCH] Revert "[SymmMem] Skip multicast init if any CUDA call fails (#168049)"

This reverts commit 8cb8b6cbbdbfc790be2921c768dab403157671ef.

Reverted https://github.com/pytorch/pytorch/pull/168049 on behalf of https://github.com/yangw-dev due to a D87346992 internal error that conflicts with the main branch; please rebase and try to merge again. These changes have conflicts when merging with the master branch. Rebase this diff. ([comment](https://github.com/pytorch/pytorch/pull/168049#issuecomment-3552985895))
---
 c10/cuda/driver_api.h                         |  16 ---
 .../c10d/symm_mem/CUDASymmetricMemory.cu      | 121 ++++++++----------
 2 files changed, 51 insertions(+), 86 deletions(-)

diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h
index 1ff0c9a12ac..380e7939ff7 100644
--- a/c10/cuda/driver_api.h
+++ b/c10/cuda/driver_api.h
@@ -20,22 +20,6 @@
     } \
   } while (0)
 
-#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \
-  do { \
-    CUresult __err = EXPR; \
-    if (__err != CUDA_SUCCESS) { \
-      const char* err_str; \
-      CUresult get_error_str_err [[maybe_unused]] = \
-          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
-      if (get_error_str_err != CUDA_SUCCESS) { \
-        TORCH_WARN("CUDA driver error: unknown error"); \
-      } else { \
-        TORCH_WARN("CUDA driver error: ", err_str); \
-      } \
-      goto NEXT; \
-    } \
-  } while (0)
-
 // The integer in the second column specifies the requested CUDA Driver API
 // version. The dynamic loader will accept a driver with a newer version, but it
 // ensures that the requested symbol exists in *at least* the specified version

diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
index f83d42df4ac..6352330c387 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
@@ -517,11 +517,6 @@ static void init_multicast_for_block(
   using McHandleType =
       std::conditional_t<use_fabric_handle, CUmemFabricHandle, int>;
 
-  McHandleType invalidator;
-  std::memset(&invalidator, UINT8_MAX, sizeof(McHandleType));
-
-  // Phase 1: export handle (rank 0 only)
-  McHandleType mc_exported_handle{};
   if (rank == 0) {
     CUmulticastObjectProp mc_prop{};
     mc_prop.numDevices = world_size;
@@ -530,82 +525,68 @@
     // create a multicast object, which acts as a handle that allows multiple
     // devices or processes to access the same memory allocation coherently.
-    try {
-      C10_CUDA_DRIVER_CHECK(
-          driver_api->cuMulticastCreate_(&mc_handle, &mc_prop));
-      // using the CUDA Driver API to export a multicast object into a POSIX file
-      // descriptor.
-      C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_(
-          &mc_exported_handle, mc_handle, handleType, 0));
-    } catch (const std::exception& e) {
-      // Allow peers gracefully skip multicast initialization by sending -1
-      mc_exported_handle = invalidator;
+    auto err = driver_api->cuMulticastCreate_(&mc_handle, &mc_prop);
+    if (err != CUDA_SUCCESS) {
+      const char* err_str;
+      CUresult get_error_str_err = driver_api->cuGetErrorString_(err, &err_str);
+      if (get_error_str_err != CUDA_SUCCESS) {
+        err_str = "unknown cuda driver error";
+      }
       LOG(WARNING)
-          << "SymmetricMemory: fail to export multicast handle.\n"
-          << e.what();
+          << "SymmetricMemory: cuMulticastCreate failed with: \"" << err_str
+          << "\". Gracefully skipping multicast initialization. "
+          << "However, this is unexpected. Please report the issue on GitHub.";
+      // Allow peers gracefully skip multicast initialization by sending -1
+      // TODO: allow graceful skip for fabric
+      if constexpr (!use_fabric_handle) {
+        ipc_channel.broadcast_fds(rank, 0, pids, -1);
+      }
+      return;
     }
-  }
-  // Phase 2: Exchange handle
-  McHandleType recv_handle;
-  if constexpr (!use_fabric_handle) {
-    recv_handle = ipc_channel.broadcast_fds(rank, 0, pids, mc_exported_handle);
-  } else {
-    // TODO implement storeExchange.broadcast
-    auto gathered_handles = storeExchange.all_gather(store, rank, world_size, mc_exported_handle);
-    recv_handle = std::move(gathered_handles[0]);
-  }
-
-  // Check exchange result
-  if (memcmp(&recv_handle, &invalidator, sizeof(McHandleType)) == 0) {
-    LOG(WARNING) << "Gracefully skipping multicast initialization.";
-    return;
-  }
-
-  // Flip to true after all CUDA steps finish
-  bool success_end = false;
-
-  // Phase 3: Import handle (non-0 ranks only)
-  if (rank != 0) {
+    McHandleType mc_exported_handle;
+    // using the CUDA Driver API to export a multicast object into a POSIX file
+    // descriptor.
+    C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_(
+        &mc_exported_handle, mc_handle, handleType, 0));
     if constexpr (!use_fabric_handle) {
-      // Convert back to a handle from the broadcasted POSIX file descriptor.
-      C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMemImportFromShareableHandle_(
-          &mc_handle,
-          (void*)(uintptr_t)recv_handle,
-          CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR), check_all);
+      ipc_channel.broadcast_fds(rank, 0, pids, mc_exported_handle);
+      // Ref count is incremented as soon as SCM_RIGHTS send happens
+      close(mc_exported_handle);
     } else {
-      C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMemImportFromShareableHandle_(
-          &mc_handle, (void*)&(recv_handle), CU_MEM_HANDLE_TYPE_FABRIC), check_all);
+      // TODO implement storeExchange.broadcast
+      storeExchange.all_gather(store, rank, world_size, mc_exported_handle);
+    }
+
+  } else {
+    if constexpr (!use_fabric_handle) {
+      int mc_fd = ipc_channel.broadcast_fds(rank, 0, pids, -1);
+      if (mc_fd == -1) {
+        return;
+      }
+      // Convert back to a handle from the broadcasted POSIX file descriptor.
+      C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_(
+          &mc_handle,
+          (void*)(uintptr_t)mc_fd,
+          CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
+      close(mc_fd);
+    } else {
+      CUmemFabricHandle null_handle{};
+      auto mc_handles =
+          storeExchange.all_gather(store, rank, world_size, null_handle);
+      C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_(
+          &mc_handle, (void*)&(mc_handles[0]), CU_MEM_HANDLE_TYPE_FABRIC));
     }
   }
-  // Phase 4: Bind memory
   // All rank adds their physical allocation to the multicast object
-  C10_CUDA_DRIVER_CHECK_GOTO(
-      driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx), check_all);
-  C10_CUDA_DRIVER_CHECK_GOTO(driver_api->cuMulticastBindMem_(
-      mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0), check_all);
+  C10_CUDA_DRIVER_CHECK(
+      driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx));
+  C10_CUDA_DRIVER_CHECK(driver_api->cuMulticastBindMem_(
+      mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0));
-  success_end = true;
-
-check_all:
-  // Whether all ranks have succeeded
-  bool all_succeed = true;
-  auto rank_successes = storeExchange.all_gather(store, rank, world_size, success_end);
-  for (int r = 0; r < world_size; ++r) {
-    all_succeed &= rank_successes[r];
-  }
-  // Close the file descriptor before exit
-  if constexpr (!use_fabric_handle) {
-    close(recv_handle);
-  }
-  if (!all_succeed) {
-    LOG(WARNING) << "Gracefully skipping multicast initialization.";
-    return;
-  }
-
-  // Phase 5: Map to virtual memory
   map_block(&mc_addr, mc_handle, block->block_size, block->device_idx);
+  storeExchange.barrier(store, rank, world_size);
 #endif
 }
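
For reference, the control flow that this revert restores can be summarized with the simplified sketch below. It is a hypothetical, stand-alone model, not the PyTorch implementation: create_and_export_multicast, broadcast_fd, and init_multicast_for_rank are illustrative stand-ins for the real cuMulticastCreate / cuMemExportToShareableHandle calls and the ipc_channel.broadcast_fds plumbing seen in the diff. What it demonstrates is the sentinel protocol in the restored code: if rank 0 fails while creating or exporting the multicast object, it broadcasts -1 instead of a valid POSIX file descriptor, and every rank then skips multicast initialization gracefully instead of aborting.

// Hypothetical, self-contained sketch of the sentinel protocol shown in the
// restored code above. The names below are illustrative stand-ins; they are
// not PyTorch APIs.
#include <cstdio>

// Stand-in for rank 0's cuMulticastCreate + cuMemExportToShareableHandle
// sequence: returns an exported fd on success, or -1 to model a CUDA failure.
int create_and_export_multicast(bool simulate_failure) {
  return simulate_failure ? -1 : 42;
}

// Stand-in for ipc_channel.broadcast_fds: every rank observes rank 0's value.
int broadcast_fd(int root_value) {
  return root_value;
}

// Each rank either imports the broadcast fd or skips multicast entirely.
bool init_multicast_for_rank(int rank, int root_fd) {
  int fd = broadcast_fd(root_fd);
  if (fd == -1) {
    std::printf("rank %d: received sentinel -1, skipping multicast init\n", rank);
    return false;
  }
  std::printf("rank %d: importing multicast handle from fd %d\n", rank, fd);
  return true;
}

int main() {
  const int world_size = 4;
  for (bool simulate_failure : {false, true}) {
    // Rank 0 attempts the driver calls; whatever it gets (fd or -1) is broadcast.
    int root_fd = create_and_export_multicast(simulate_failure);
    for (int rank = 0; rank < world_size; ++rank) {
      init_multicast_for_rank(rank, root_fd);
    }
  }
  return 0;
}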