Integrate hermetic nvshmem repository in XLA and TF projects.

The TF wheel build rule implementation is also updated to prevent accidental dependencies on NVSHMEM libraries in the wheel content. If the wheel needs to be built with these dependencies, pass `--@local_config_nvshmem//:override_include_nvshmem_libs=True` in the Bazel options.

`NVSHMEM` binaries are included in the dependencies if `CUDA` binary dependencies are added as well, e.g. `--@local_config_cuda//:enable_cuda`.

`NVSHMEM` libraries are included in the dependencies if `--@local_config_nvshmem//:include_nvshmem_libs=True` is set (the default flag value is `False`). Please note that this is a temporary solution, and it should be removed after GLIBC is updated on RBE runners. At the moment, `libnvshmem.so` files can't be linked to the targets because they are built against a GLIBC version higher than the one on RBE runners. In the future, `--@local_config_cuda//cuda:include_cuda_libs=True` should be used.

The next change will add `NVSHMEM` deps to individual Bazel targets via `select`.

PiperOrigin-RevId: 769344482
This commit is contained in:
A. Unique TensorFlower
2025-06-09 16:41:27 -07:00
committed by TensorFlower Gardener
parent 53a6c65b77
commit 707048b1b9
17 changed files with 211 additions and 26 deletions

View File

@@ -266,9 +266,10 @@ build:mkl_aarch64 -c opt
build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
build:mkl_aarch64_threadpool -c opt
# Default CUDA and CUDNN versions.
# Default CUDA, CUDNN and NVSHMEM versions.
build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1"
build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"
build:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
# CUDA: This config refers to building CUDA op kernels with nvcc.
build:cuda --repo_env TF_NEED_CUDA=1
@@ -280,6 +281,7 @@ build:cuda --@local_config_cuda//cuda:include_cuda_libs=true
# This configuration is used for building the wheels.
build:cuda_wheel --@local_config_cuda//cuda:include_cuda_libs=false
build:cuda_wheel --@local_config_nvshmem//:include_nvshmem_libs=false
# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda

View File

@@ -138,3 +138,30 @@ load(
)
nccl_configure(name = "local_config_nccl")
load(
"@local_xla//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl",
"nvshmem_json_init_repository",
)
nvshmem_json_init_repository()
load(
"@nvshmem_redist_json//:distributions.bzl",
"NVSHMEM_REDISTRIBUTIONS",
)
load(
"@local_xla//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl",
"nvshmem_redist_init_repository",
)
nvshmem_redist_init_repository(
nvshmem_redistributions = NVSHMEM_REDISTRIBUTIONS,
)
load(
"@local_xla//third_party/nvshmem/hermetic:nvshmem_configure.bzl",
"nvshmem_configure",
)
nvshmem_configure(name = "local_config_nvshmem")

View File

@@ -1091,6 +1091,7 @@ bzl_library(
"@local_xla//third_party/llvm_openmp:openmp_bzl",
"@local_xla//third_party/py/rules_pywrap:pywrap_bzl",
"@local_xla//xla/tsl:tsl_bzl",
"@local_xla//xla/tsl:tsl_default_bzl",
"@local_xla//xla/tsl/mkl:build_defs_bzl",
"@rules_java//java:rules",
],

View File

@@ -1,7 +1,7 @@
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load("@local_xla//xla/tsl:tsl.bzl", "if_cuda_libs")
load("@local_xla//xla/tsl:tsl.default.bzl", "if_cuda_libs")
load(
"//tensorflow:tensorflow.bzl",
"clean_dep",

View File

@@ -72,10 +72,13 @@ load(
_cc_header_only_library = "cc_header_only_library",
_custom_op_cc_header_only_library = "custom_op_cc_header_only_library",
_if_cuda_or_rocm = "if_cuda_or_rocm",
_if_cuda_tools = "if_cuda_tools",
_if_nccl = "if_nccl",
_transitive_hdrs = "transitive_hdrs",
)
load(
"@local_xla//xla/tsl:tsl.default.bzl",
_if_cuda_tools = "if_cuda_tools",
)
load(
"@local_config_tensorrt//:build_defs.bzl",
"if_tensorrt",

View File

@@ -77,11 +77,18 @@ def _is_dest_file(basename, dest_files_suffixes):
def _tf_wheel_impl(ctx):
include_cuda_libs = ctx.attr.include_cuda_libs[BuildSettingInfo].value
override_include_cuda_libs = ctx.attr.override_include_cuda_libs[BuildSettingInfo].value
include_nvshmem_libs = ctx.attr.include_nvshmem_libs[BuildSettingInfo].value
override_include_nvshmem_libs = ctx.attr.override_include_nvshmem_libs[BuildSettingInfo].value
if include_cuda_libs and not override_include_cuda_libs:
fail("TF wheel shouldn't be built with CUDA dependencies." +
" Please provide `--config=cuda_wheel` for bazel build command." +
" If you absolutely need to add CUDA dependencies, provide" +
" `--@local_config_cuda//cuda:override_include_cuda_libs=true`.")
if include_nvshmem_libs and not override_include_nvshmem_libs:
fail("TF wheel shouldn't be built directly against the NVSHMEM libraries." +
" Please provide `--config=cuda_wheel` for bazel build command." +
" If you absolutely need to build links directly against the NVSHMEM libraries," +
" `provide --@local_config_nvshmem//:override_include_nvshmem_libs=true`.")
executable = ctx.executable.wheel_binary
full_wheel_version = (TF_VERSION + TF_WHEEL_VERSION_SUFFIX)
@@ -147,6 +154,8 @@ tf_wheel = rule(
),
"include_cuda_libs": attr.label(default = Label("@local_config_cuda//cuda:include_cuda_libs")),
"override_include_cuda_libs": attr.label(default = Label("@local_config_cuda//cuda:override_include_cuda_libs")),
"include_nvshmem_libs": attr.label(default = Label("@local_config_nvshmem//:include_nvshmem_libs")),
"override_include_nvshmem_libs": attr.label(default = Label("@local_config_nvshmem//:override_include_nvshmem_libs")),
"platform_tag": attr.string(mandatory = True),
"platform_name": attr.string(mandatory = True),
},

View File

@@ -99,3 +99,30 @@ load(
)
nccl_configure(name = "local_config_nccl")
load(
"//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl",
"nvshmem_json_init_repository",
)
nvshmem_json_init_repository()
load(
"@nvshmem_redist_json//:distributions.bzl",
"NVSHMEM_REDISTRIBUTIONS",
)
load(
"//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl",
"nvshmem_redist_init_repository",
)
nvshmem_redist_init_repository(
nvshmem_redistributions = NVSHMEM_REDISTRIBUTIONS,
)
# Loaded with a main-repository label ("//..."), matching the other NVSHMEM
# hermetic .bzl loads in this file, instead of going through @local_xla.
load(
    "//third_party/nvshmem/hermetic:nvshmem_configure.bzl",
    "nvshmem_configure",
)

# Instantiates the NVSHMEM configuration repository under the name that the
# `--@local_config_nvshmem//:...` flags refer to.
nvshmem_configure(name = "local_config_nvshmem")

View File

@@ -162,9 +162,10 @@ build:mkl_aarch64 -c opt
build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
build:mkl_aarch64_threadpool -c opt
# Default CUDA and CUDNN versions.
# Default CUDA, CUDNN and NVSHMEM versions.
build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.6.3"
build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"
build:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
# CUDA: This config refers to building CUDA op kernels with nvcc.
build:cuda --repo_env TF_NEED_CUDA=1
@@ -176,6 +177,7 @@ build:cuda --@local_config_cuda//cuda:include_cuda_libs=true
# This configuration is used for building the wheels.
build:cuda_wheel --@local_config_cuda//cuda:include_cuda_libs=false
build:cuda_wheel --@local_config_nvshmem//:include_nvshmem_libs=false
# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda

View File

@@ -26,30 +26,64 @@ load(
)
NVSHMEM_ENABLED_BUILD_CONTENT = """
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting")
# This set of flags and config_settings is needed to enable NVSHMEM dependencies
# separately from CUDA dependencies. The reason is that NVSHMEM libraries
# require GLIBC 2.28 and above, which we don't have on RBE runners yet.
# TODO(ybaturina): Remove this once GLIBC 2.28 is available on RBE.
bool_flag(
name = "include_nvshmem_libs",
build_setting_default = False,
visibility = ["//visibility:public"],
)
config_setting(
name = "nvshmem_libs",
flag_values = {":include_nvshmem_libs": "True"},
visibility = ["//visibility:private"],
)
bool_setting(
name = "true_setting",
visibility = ["//visibility:private"],
build_setting_default = True,
bool_flag(
name = "override_include_nvshmem_libs",
build_setting_default = False,
visibility = ["//visibility:public"],
)
config_setting(
name = "overrided_nvshmem_libs",
flag_values = {":true_setting": "False"},
visibility = ["//visibility:private"],
)
alias(
name = "nvshmem_tools",
flag_values = {":true_setting": "True"},
actual = "@local_config_cuda//:is_cuda_enabled",
visibility = ["//visibility:public"],
)
selects.config_setting_group(
name = "any_nvshmem_libs",
match_any = [
":nvshmem_libs",
":overrided_nvshmem_libs",
],
visibility = ["//visibility:private"],
)
selects.config_setting_group(
name = "nvshmem_tools_and_libs",
match_all = [
":any_nvshmem_libs",
":nvshmem_tools",
],
visibility = ["//visibility:public"],
)
"""
NVSHMEM_DISABLED_BUILD_CONTENT = """
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting")
bool_setting(
@@ -58,14 +92,52 @@ bool_setting(
build_setting_default = True,
)
bool_flag(
name = "include_nvshmem_libs",
build_setting_default = False,
visibility = ["//visibility:public"],
)
config_setting(
name = "nvshmem_tools",
flag_values = {":true_setting": "False"},
visibility = ["//visibility:public"],
)
config_setting(
name = "nvshmem_libs",
flag_values = {":true_setting": "False"},
visibility = ["//visibility:private"],
)
bool_flag(
name = "override_include_nvshmem_libs",
build_setting_default = False,
visibility = ["//visibility:public"],
)
config_setting(
name = "overrided_nvshmem_libs",
flag_values = {":true_setting": "False"},
visibility = ["//visibility:private"],
)
selects.config_setting_group(
name = "any_nvshmem_libs",
match_any = [
":nvshmem_libs",
":overrided_nvshmem_libs"
],
visibility = ["//visibility:private"],
)
selects.config_setting_group(
name = "nvshmem_tools_and_libs",
match_all = [
":any_nvshmem_libs",
":nvshmem_tools"
],
visibility = ["//visibility:public"],
)
"""

View File

@@ -1454,6 +1454,7 @@ bzl_library(
"@bazel_skylib//lib:paths",
"//xla/tsl:package_groups_bzl",
"//xla/tsl:tsl_bzl",
"//xla/tsl:tsl_default_bzl",
"//xla/tsl/platform/default:cuda_build_defs_bzl",
],
)

View File

@@ -3,7 +3,8 @@
load("@bazel_skylib//lib:paths.bzl", "paths")
load("@rules_python//python:defs.bzl", "py_binary")
load("//xla/tsl:package_groups.bzl", "DEFAULT_LOAD_VISIBILITY")
load("//xla/tsl:tsl.bzl", "if_cuda_tools", "if_google", "if_oss")
load("//xla/tsl:tsl.bzl", "if_google", "if_oss")
load("//xla/tsl:tsl.default.bzl", "if_cuda_tools")
load("//xla/tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
visibility(DEFAULT_LOAD_VISIBILITY)

View File

@@ -14,14 +14,19 @@ load(
load("//xla/tests:build_defs.bzl", "xla_test")
load(
"//xla/tsl:tsl.bzl",
# copybara:comment_begin
"if_cuda_tools",
# copybara:comment_end
"if_google",
"if_windows",
"internal_visibility",
"tsl_copts",
)
# copybara:comment_begin
load(
"//xla/tsl:tsl.default.bzl",
"if_cuda_tools",
)
# copybara:comment_end
load("//xla/tsl/platform:build_config.bzl", "tf_proto_library")
load(
"//xla/tsl/platform:build_config_root.bzl",

View File

@@ -553,6 +553,16 @@ bzl_library(
visibility = ["//xla:__subpackages__"],
)
# Publicly visible bzl_library for tsl.default.bzl; depends on the
# package_groups and tsl bzl_library targets of this package.
# NOTE(review): srcs goes through if_oss() - presumably so the file is only
# listed in OSS builds; confirm against the if_oss definition.
bzl_library(
name = "tsl_default_bzl",
srcs = if_oss(["tsl.default.bzl"]),
visibility = ["//visibility:public"],
deps = [
":package_groups_bzl",
":tsl_bzl",
],
)
# copybara:comment_begin(oss-only)
cc_library(
name = "grpc++",

View File

@@ -11,7 +11,7 @@ load(
"if_cuda_is_configured",
)
load(
"//xla/tsl:tsl.bzl",
"//xla/tsl:tsl.default.bzl",
"if_cuda_libs",
)
load("//xla/tsl/cuda:stub.bzl", "cuda_stub")

View File

@@ -3,14 +3,18 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
load(
"//xla/tsl:tsl.bzl",
"if_cuda_tools",
"if_not_fuchsia",
"if_not_windows",
"if_oss",
"internal_visibility",
"tsl_copts",
)
load("//xla/tsl:tsl.default.bzl", "filegroup", "tsl_grpc_cc_dependencies")
load(
"//xla/tsl:tsl.default.bzl",
"filegroup",
"if_cuda_tools",
"tsl_grpc_cc_dependencies",
)
load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
package(

View File

@@ -233,16 +233,6 @@ def if_with_tpu_support(if_true, if_false = []):
"//conditions:default": if_false,
})
# These configs are used to determine whether we should use CUDA tools and libs in cc_libraries.
# They are intended for the OSS builds only.
def if_cuda_tools(if_true, if_false = []): # buildifier: disable=unused-variable
"""Shorthand for select()'ing on whether we're building with hermetic CUDA tools."""
return select({"@local_config_cuda//cuda:cuda_tools": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false
def if_cuda_libs(if_true, if_false = []): # buildifier: disable=unused-variable
"""Shorthand for select()'ing on whether we need to include hermetic CUDA libraries.

Yields `if_true` when `@local_config_cuda//cuda:cuda_tools_and_libs` matches
and `if_false` (an empty list unless overridden) otherwise.
"""
return select({"@local_config_cuda//cuda:cuda_tools_and_libs": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false
def get_win_copts(is_external = False):
WINDOWS_COPTS = [
# copybara:uncomment_begin(no MSVC flags in google)

View File

@@ -27,3 +27,34 @@ tsl_pybind_extension = _tsl_pybind_extension
tsl_google_bzl_deps = _tsl_google_bzl_deps
tsl_extra_config_settings = _tsl_extra_config_settings
tsl_extra_config_settings_targets = _tsl_extra_config_settings_targets
# These configs are used to determine whether we should use CUDA/NVSHMEM tools and libs in
# cc_libraries.
# They are intended for the OSS builds only.
def if_cuda_tools(if_true, if_false = []):  # buildifier: disable=unused-variable
    """Chooses between two values based on whether hermetic CUDA tools are in use.

    Args:
      if_true: value picked when `@local_config_cuda//cuda:cuda_tools` matches.
      if_false: value picked in every other configuration (empty list by default).

    Returns:
      A select() over the two branches.
    """
    branches = {
        "@local_config_cuda//cuda:cuda_tools": if_true,
        "//conditions:default": if_false,
    }
    return select(branches)
def if_cuda_libs(if_true, if_false = []):  # buildifier: disable=unused-variable
    """Chooses between two values based on whether hermetic CUDA libraries are needed.

    Args:
      if_true: value picked when `@local_config_cuda//cuda:cuda_tools_and_libs`
        matches.
      if_false: value picked in every other configuration (empty list by default).

    Returns:
      A select() over the two branches.
    """
    branches = {
        "@local_config_cuda//cuda:cuda_tools_and_libs": if_true,
        "//conditions:default": if_false,
    }
    return select(branches)
def if_nvshmem_tools(if_true, if_false = []):  # buildifier: disable=unused-variable
    """Chooses between two values based on whether hermetic NVSHMEM tools are in use.

    Args:
      if_true: value picked when `@local_config_nvshmem//:nvshmem_tools` matches.
      if_false: value picked in every other configuration (empty list by default).

    Returns:
      A select() over the two branches.
    """
    branches = {
        "@local_config_nvshmem//:nvshmem_tools": if_true,
        "//conditions:default": if_false,
    }
    return select(branches)
def if_nvshmem_libs(if_true, if_false = []):  # buildifier: disable=unused-variable
    """Chooses between two values based on whether hermetic NVSHMEM libraries are needed.

    Args:
      if_true: value picked when `@local_config_nvshmem//:nvshmem_tools_and_libs`
        matches.
      if_false: value picked in every other configuration (empty list by default).

    Returns:
      A select() over the two branches.
    """
    branches = {
        "@local_config_nvshmem//:nvshmem_tools_and_libs": if_true,
        "//conditions:default": if_false,
    }
    return select(branches)