From 707048b1b94e0cd358d543f8d557bc2423735d75 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 9 Jun 2025 16:41:27 -0700 Subject: [PATCH] Integrate hermetic `nvshmem` repository in XLA and TF projects. TF wheel build rule implementation is also updated to exclude accidental dependencies on NVSHMEM libraries in the wheel content. If the wheel needs to be built with these dependencies, provide `--@local_config_nvshmem//:override_include_nvshmem_libs=True` in Bazel options. `NVSHMEM` binaries are included in the dependencies if `CUDA` binary dependencies are added as well, e.g. `--@local_config_cuda//:enable_cuda`. `NVSHMEM` libraries are included in the dependencies if `--@local_config_nvshmem//:include_nvshmem_libs=True` (the default flag value is `False`). Please note that this is a temporary solution, and it should be removed after GLIBC is updated on RBE runners. At the moment `libnvshmem.so` files can't be linked to the targets because they are built with a GLIBC version higher than the one on RBE runners. In the future `--@local_config_cuda//cuda:include_cuda_libs=True` should be used. The next change will add `NVSHMEM` deps to individual Bazel targets via `select`. 
PiperOrigin-RevId: 769344482 --- .bazelrc | 4 +- WORKSPACE | 27 ++++++ tensorflow/BUILD | 1 + tensorflow/core/common_runtime/gpu/BUILD | 2 +- tensorflow/tensorflow.bzl | 5 +- .../tools/pip_package/utils/tf_wheel.bzl | 9 ++ third_party/xla/WORKSPACE | 27 ++++++ third_party/xla/tensorflow.bazelrc | 4 +- .../nvshmem/hermetic/nvshmem_configure.bzl | 82 +++++++++++++++++-- third_party/xla/xla/BUILD | 1 + third_party/xla/xla/lit.bzl | 3 +- .../xla/xla/stream_executor/cuda/BUILD | 11 ++- third_party/xla/xla/tsl/BUILD | 10 +++ third_party/xla/xla/tsl/cuda/BUILD.bazel | 2 +- .../xla/xla/tsl/platform/default/BUILD | 8 +- third_party/xla/xla/tsl/tsl.bzl | 10 --- third_party/xla/xla/tsl/tsl.default.bzl | 31 +++++++ 17 files changed, 211 insertions(+), 26 deletions(-) diff --git a/.bazelrc b/.bazelrc index 325819653b2..4ad2a1ffa97 100644 --- a/.bazelrc +++ b/.bazelrc @@ -266,9 +266,10 @@ build:mkl_aarch64 -c opt build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true build:mkl_aarch64_threadpool -c opt -# Default CUDA and CUDNN versions. +# Default CUDA, CUDNN and NVSHMEM versions. build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1" build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0" +build:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5" # CUDA: This config refers to building CUDA op kernels with nvcc. build:cuda --repo_env TF_NEED_CUDA=1 @@ -280,6 +281,7 @@ build:cuda --@local_config_cuda//cuda:include_cuda_libs=true # This configuration is used for building the wheels. build:cuda_wheel --@local_config_cuda//cuda:include_cuda_libs=false +build:cuda_wheel --@local_config_nvshmem//:include_nvshmem_libs=false # CUDA: This config refers to building CUDA op kernels with clang. 
build:cuda_clang --config=cuda diff --git a/WORKSPACE b/WORKSPACE index 43a224e1df9..5fc07a20385 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -138,3 +138,30 @@ load( ) nccl_configure(name = "local_config_nccl") + +load( + "@local_xla//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl", + "nvshmem_json_init_repository", +) + +nvshmem_json_init_repository() + +load( + "@nvshmem_redist_json//:distributions.bzl", + "NVSHMEM_REDISTRIBUTIONS", +) +load( + "@local_xla//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl", + "nvshmem_redist_init_repository", +) + +nvshmem_redist_init_repository( + nvshmem_redistributions = NVSHMEM_REDISTRIBUTIONS, +) + +load( + "@local_xla//third_party/nvshmem/hermetic:nvshmem_configure.bzl", + "nvshmem_configure", +) + +nvshmem_configure(name = "local_config_nvshmem") diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 995156cdde6..8cf5a53dd34 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -1091,6 +1091,7 @@ bzl_library( "@local_xla//third_party/llvm_openmp:openmp_bzl", "@local_xla//third_party/py/rules_pywrap:pywrap_bzl", "@local_xla//xla/tsl:tsl_bzl", + "@local_xla//xla/tsl:tsl_default_bzl", "@local_xla//xla/tsl/mkl:build_defs_bzl", "@rules_java//java:rules", ], diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD index b65e5f720ed..eb88a66160e 100644 --- a/tensorflow/core/common_runtime/gpu/BUILD +++ b/tensorflow/core/common_runtime/gpu/BUILD @@ -1,7 +1,7 @@ load("@bazel_skylib//lib:selects.bzl", "selects") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") -load("@local_xla//xla/tsl:tsl.bzl", "if_cuda_libs") +load("@local_xla//xla/tsl:tsl.default.bzl", "if_cuda_libs") load( "//tensorflow:tensorflow.bzl", "clean_dep", diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 744e01823c1..c8efcc6b0ba 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -72,10 
+72,13 @@ load( _cc_header_only_library = "cc_header_only_library", _custom_op_cc_header_only_library = "custom_op_cc_header_only_library", _if_cuda_or_rocm = "if_cuda_or_rocm", - _if_cuda_tools = "if_cuda_tools", _if_nccl = "if_nccl", _transitive_hdrs = "transitive_hdrs", ) +load( + "@local_xla//xla/tsl:tsl.default.bzl", + _if_cuda_tools = "if_cuda_tools", +) load( "@local_config_tensorrt//:build_defs.bzl", "if_tensorrt", diff --git a/tensorflow/tools/pip_package/utils/tf_wheel.bzl b/tensorflow/tools/pip_package/utils/tf_wheel.bzl index c4dcd4682c8..fa66b202821 100644 --- a/tensorflow/tools/pip_package/utils/tf_wheel.bzl +++ b/tensorflow/tools/pip_package/utils/tf_wheel.bzl @@ -77,11 +77,18 @@ def _is_dest_file(basename, dest_files_suffixes): def _tf_wheel_impl(ctx): include_cuda_libs = ctx.attr.include_cuda_libs[BuildSettingInfo].value override_include_cuda_libs = ctx.attr.override_include_cuda_libs[BuildSettingInfo].value + include_nvshmem_libs = ctx.attr.include_nvshmem_libs[BuildSettingInfo].value + override_include_nvshmem_libs = ctx.attr.override_include_nvshmem_libs[BuildSettingInfo].value if include_cuda_libs and not override_include_cuda_libs: fail("TF wheel shouldn't be built with CUDA dependencies." + " Please provide `--config=cuda_wheel` for bazel build command." + " If you absolutely need to add CUDA dependencies, provide" + " `--@local_config_cuda//cuda:override_include_cuda_libs=true`.") + if include_nvshmem_libs and not override_include_nvshmem_libs: + fail("TF wheel shouldn't be built directly against the NVSHMEM libraries." + + " Please provide `--config=cuda_wheel` for bazel build command." 
+ + " If you absolutely need to link directly against the NVSHMEM libraries, provide" + " `--@local_config_nvshmem//:override_include_nvshmem_libs=true`.") executable = ctx.executable.wheel_binary full_wheel_version = (TF_VERSION + TF_WHEEL_VERSION_SUFFIX) @@ -147,6 +154,8 @@ tf_wheel = rule( ), "include_cuda_libs": attr.label(default = Label("@local_config_cuda//cuda:include_cuda_libs")), "override_include_cuda_libs": attr.label(default = Label("@local_config_cuda//cuda:override_include_cuda_libs")), + "include_nvshmem_libs": attr.label(default = Label("@local_config_nvshmem//:include_nvshmem_libs")), + "override_include_nvshmem_libs": attr.label(default = Label("@local_config_nvshmem//:override_include_nvshmem_libs")), "platform_tag": attr.string(mandatory = True), "platform_name": attr.string(mandatory = True), }, diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE index fb250a66dac..2ed3b4fb5c2 100644 --- a/third_party/xla/WORKSPACE +++ b/third_party/xla/WORKSPACE @@ -99,3 +99,30 @@ load( ) nccl_configure(name = "local_config_nccl") + +load( + "//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl", + "nvshmem_json_init_repository", +) + +nvshmem_json_init_repository() + +load( + "@nvshmem_redist_json//:distributions.bzl", + "NVSHMEM_REDISTRIBUTIONS", +) +load( + "//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl", + "nvshmem_redist_init_repository", +) + +nvshmem_redist_init_repository( + nvshmem_redistributions = NVSHMEM_REDISTRIBUTIONS, +) + +load( + "@local_xla//third_party/nvshmem/hermetic:nvshmem_configure.bzl", + "nvshmem_configure", +) + +nvshmem_configure(name = "local_config_nvshmem") diff --git a/third_party/xla/tensorflow.bazelrc b/third_party/xla/tensorflow.bazelrc index 5e42e49dcab..c1611786c8e 100644 --- a/third_party/xla/tensorflow.bazelrc +++ b/third_party/xla/tensorflow.bazelrc @@ -162,9 +162,10 @@ build:mkl_aarch64 -c opt build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true 
build:mkl_aarch64_threadpool -c opt -# Default CUDA and CUDNN versions. +# Default CUDA, CUDNN and NVSHMEM versions. build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.6.3" build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0" +build:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5" # CUDA: This config refers to building CUDA op kernels with nvcc. build:cuda --repo_env TF_NEED_CUDA=1 @@ -176,6 +177,7 @@ build:cuda --@local_config_cuda//cuda:include_cuda_libs=true # This configuration is used for building the wheels. build:cuda_wheel --@local_config_cuda//cuda:include_cuda_libs=false +build:cuda_wheel --@local_config_nvshmem//:include_nvshmem_libs=false # CUDA: This config refers to building CUDA op kernels with clang. build:cuda_clang --config=cuda diff --git a/third_party/xla/third_party/nvshmem/hermetic/nvshmem_configure.bzl b/third_party/xla/third_party/nvshmem/hermetic/nvshmem_configure.bzl index 9b506e04a74..94f6631edb6 100644 --- a/third_party/xla/third_party/nvshmem/hermetic/nvshmem_configure.bzl +++ b/third_party/xla/third_party/nvshmem/hermetic/nvshmem_configure.bzl @@ -26,30 +26,64 @@ load( ) NVSHMEM_ENABLED_BUILD_CONTENT = """ +load("@bazel_skylib//lib:selects.bzl", "selects") load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting") + +# This set of flags and config_settings is needed to enable NVSHMEM dependencies +# separately from CUDA dependencies. The reason is that NVSHMEM libraries +# require GLIBC 2.28 and above, which we don't have on RBE runners yet. +# TODO(ybaturina): Remove this once GLIBC 2.28 is available on RBE. 
bool_flag( name = "include_nvshmem_libs", build_setting_default = False, + visibility = ["//visibility:public"], ) config_setting( name = "nvshmem_libs", flag_values = {":include_nvshmem_libs": "True"}, + visibility = ["//visibility:private"], ) -bool_setting( - name = "true_setting", - visibility = ["//visibility:private"], - build_setting_default = True, +bool_flag( + name = "override_include_nvshmem_libs", + build_setting_default = False, + visibility = ["//visibility:public"], ) config_setting( + name = "overrided_nvshmem_libs", + flag_values = {":override_include_nvshmem_libs": "True"}, + visibility = ["//visibility:private"], +) + +alias( name = "nvshmem_tools", - flag_values = {":true_setting": "True"}, + actual = "@local_config_cuda//:is_cuda_enabled", + visibility = ["//visibility:public"], +) + +selects.config_setting_group( + name = "any_nvshmem_libs", + match_any = [ + ":nvshmem_libs", + ":overrided_nvshmem_libs", + ], + visibility = ["//visibility:private"], +) + +selects.config_setting_group( + name = "nvshmem_tools_and_libs", + match_all = [ + ":any_nvshmem_libs", + ":nvshmem_tools", + ], + visibility = ["//visibility:public"], ) """ NVSHMEM_DISABLED_BUILD_CONTENT = """ +load("@bazel_skylib//lib:selects.bzl", "selects") load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting") bool_setting( @@ -58,14 +92,52 @@ bool_setting( build_setting_default = True, ) +bool_flag( + name = "include_nvshmem_libs", + build_setting_default = False, + visibility = ["//visibility:public"], +) + config_setting( name = "nvshmem_tools", flag_values = {":true_setting": "False"}, + visibility = ["//visibility:public"], ) config_setting( name = "nvshmem_libs", flag_values = {":true_setting": "False"}, + visibility = ["//visibility:private"], +) + +bool_flag( + name = "override_include_nvshmem_libs", + build_setting_default = False, + visibility = ["//visibility:public"], +) + +config_setting( + name = "overrided_nvshmem_libs", + flag_values = {":true_setting": "False"}, 
+ visibility = ["//visibility:private"], +) + +selects.config_setting_group( + name = "any_nvshmem_libs", + match_any = [ + ":nvshmem_libs", + ":overrided_nvshmem_libs" + ], + visibility = ["//visibility:private"], +) + +selects.config_setting_group( + name = "nvshmem_tools_and_libs", + match_all = [ + ":any_nvshmem_libs", + ":nvshmem_tools" + ], + visibility = ["//visibility:public"], ) """ diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD index 9084719ee2e..6de20dbf882 100644 --- a/third_party/xla/xla/BUILD +++ b/third_party/xla/xla/BUILD @@ -1454,6 +1454,7 @@ bzl_library( "@bazel_skylib//lib:paths", "//xla/tsl:package_groups_bzl", "//xla/tsl:tsl_bzl", + "//xla/tsl:tsl_default_bzl", "//xla/tsl/platform/default:cuda_build_defs_bzl", ], ) diff --git a/third_party/xla/xla/lit.bzl b/third_party/xla/xla/lit.bzl index 22c519dedd9..3a7340e5ab9 100644 --- a/third_party/xla/xla/lit.bzl +++ b/third_party/xla/xla/lit.bzl @@ -3,7 +3,8 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load("@rules_python//python:defs.bzl", "py_binary") load("//xla/tsl:package_groups.bzl", "DEFAULT_LOAD_VISIBILITY") -load("//xla/tsl:tsl.bzl", "if_cuda_tools", "if_google", "if_oss") +load("//xla/tsl:tsl.bzl", "if_google", "if_oss") +load("//xla/tsl:tsl.default.bzl", "if_cuda_tools") load("//xla/tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured") visibility(DEFAULT_LOAD_VISIBILITY) diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD index 3fd17916fcc..c5676515b3b 100644 --- a/third_party/xla/xla/stream_executor/cuda/BUILD +++ b/third_party/xla/xla/stream_executor/cuda/BUILD @@ -14,14 +14,19 @@ load( load("//xla/tests:build_defs.bzl", "xla_test") load( "//xla/tsl:tsl.bzl", - # copybara:comment_begin - "if_cuda_tools", - # copybara:comment_end "if_google", "if_windows", "internal_visibility", "tsl_copts", ) + +# copybara:comment_begin +load( + "//xla/tsl:tsl.default.bzl", + "if_cuda_tools", +) + +# 
copybara:comment_end load("//xla/tsl/platform:build_config.bzl", "tf_proto_library") load( "//xla/tsl/platform:build_config_root.bzl", diff --git a/third_party/xla/xla/tsl/BUILD b/third_party/xla/xla/tsl/BUILD index a230f93e9e5..28f6a3cf48d 100644 --- a/third_party/xla/xla/tsl/BUILD +++ b/third_party/xla/xla/tsl/BUILD @@ -553,6 +553,16 @@ bzl_library( visibility = ["//xla:__subpackages__"], ) +bzl_library( + name = "tsl_default_bzl", + srcs = if_oss(["tsl.default.bzl"]), + visibility = ["//visibility:public"], + deps = [ + ":package_groups_bzl", + ":tsl_bzl", + ], +) + # copybara:comment_begin(oss-only) cc_library( name = "grpc++", diff --git a/third_party/xla/xla/tsl/cuda/BUILD.bazel b/third_party/xla/xla/tsl/cuda/BUILD.bazel index fdcc3413e2c..b6f70cdbe5a 100644 --- a/third_party/xla/xla/tsl/cuda/BUILD.bazel +++ b/third_party/xla/xla/tsl/cuda/BUILD.bazel @@ -11,7 +11,7 @@ load( "if_cuda_is_configured", ) load( - "//xla/tsl:tsl.bzl", + "//xla/tsl:tsl.default.bzl", "if_cuda_libs", ) load("//xla/tsl/cuda:stub.bzl", "cuda_stub") diff --git a/third_party/xla/xla/tsl/platform/default/BUILD b/third_party/xla/xla/tsl/platform/default/BUILD index 98a7f14631a..acf419ba854 100644 --- a/third_party/xla/xla/tsl/platform/default/BUILD +++ b/third_party/xla/xla/tsl/platform/default/BUILD @@ -3,14 +3,18 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load( "//xla/tsl:tsl.bzl", - "if_cuda_tools", "if_not_fuchsia", "if_not_windows", "if_oss", "internal_visibility", "tsl_copts", ) -load("//xla/tsl:tsl.default.bzl", "filegroup", "tsl_grpc_cc_dependencies") +load( + "//xla/tsl:tsl.default.bzl", + "filegroup", + "if_cuda_tools", + "tsl_grpc_cc_dependencies", +) load("//xla/tsl/platform:rules_cc.bzl", "cc_library") package( diff --git a/third_party/xla/xla/tsl/tsl.bzl b/third_party/xla/xla/tsl/tsl.bzl index 3fec637a595..b0f80338fbd 100644 --- a/third_party/xla/xla/tsl/tsl.bzl +++ 
b/third_party/xla/xla/tsl/tsl.bzl @@ -233,16 +233,6 @@ def if_with_tpu_support(if_true, if_false = []): "//conditions:default": if_false, }) -# These configs are used to determine whether we should use CUDA tools and libs in cc_libraries. -# They are intended for the OSS builds only. -def if_cuda_tools(if_true, if_false = []): # buildifier: disable=unused-variable - """Shorthand for select()'ing on whether we're building with hCUDA tools.""" - return select({"@local_config_cuda//cuda:cuda_tools": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false - -def if_cuda_libs(if_true, if_false = []): # buildifier: disable=unused-variable - """Shorthand for select()'ing on whether we need to include hermetic CUDA libraries.""" - return select({"@local_config_cuda//cuda:cuda_tools_and_libs": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false - def get_win_copts(is_external = False): WINDOWS_COPTS = [ # copybara:uncomment_begin(no MSVC flags in google) diff --git a/third_party/xla/xla/tsl/tsl.default.bzl b/third_party/xla/xla/tsl/tsl.default.bzl index b746c2ca9c4..baa1d5256de 100644 --- a/third_party/xla/xla/tsl/tsl.default.bzl +++ b/third_party/xla/xla/tsl/tsl.default.bzl @@ -27,3 +27,34 @@ tsl_pybind_extension = _tsl_pybind_extension tsl_google_bzl_deps = _tsl_google_bzl_deps tsl_extra_config_settings = _tsl_extra_config_settings tsl_extra_config_settings_targets = _tsl_extra_config_settings_targets + +# These configs are used to determine whether we should use CUDA/NVSHMEM tools and libs in +# cc_libraries. +# They are intended for the OSS builds only. 
+def if_cuda_tools(if_true, if_false = []): # buildifier: disable=unused-variable + """Shorthand for select()'ing on whether we're building with hermetic CUDA tools.""" + return select({ + "@local_config_cuda//cuda:cuda_tools": if_true, + "//conditions:default": if_false, + }) + +def if_cuda_libs(if_true, if_false = []): # buildifier: disable=unused-variable + """Shorthand for select()'ing on whether we need to include hermetic CUDA libraries.""" + return select({ + "@local_config_cuda//cuda:cuda_tools_and_libs": if_true, + "//conditions:default": if_false, + }) + +def if_nvshmem_tools(if_true, if_false = []): # buildifier: disable=unused-variable + """Shorthand for select()'ing on whether we're building with hermetic NVSHMEM tools.""" + return select({ + "@local_config_nvshmem//:nvshmem_tools": if_true, + "//conditions:default": if_false, + }) + +def if_nvshmem_libs(if_true, if_false = []): # buildifier: disable=unused-variable + """Shorthand for select()'ing on whether we need to include hermetic NVSHMEM libraries.""" + return select({ + "@local_config_nvshmem//:nvshmem_tools_and_libs": if_true, + "//conditions:default": if_false, + })