diff --git a/.bazelrc b/.bazelrc
index 17285afdb38..17410912425 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -67,6 +67,7 @@ build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
 build:gdr --define=with_gdr_support=true
 build:ngraph --define=with_ngraph_support=true
 build:verbs --define=with_verbs_support=true
+build:numa --define=with_numa_support=true
 
 # Options to disable default on features
 build:noaws --define=no_aws_support=true
diff --git a/configure.py b/configure.py
index 3eb09a1ae90..673825c0ad2 100644
--- a/configure.py
+++ b/configure.py
@@ -1751,6 +1751,7 @@ def main():
   config_info_line('gdr', 'Build with GDR support.')
   config_info_line('verbs', 'Build with libverbs support.')
   config_info_line('ngraph', 'Build with Intel nGraph support.')
+  config_info_line('numa', 'Build with NUMA support.')
   config_info_line(
       'dynamic_kernels',
       '(Experimental) Build kernels into separate shared objects.')
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f53982f1efc..e1d988ac654 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -304,6 +304,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_numa_support",
+    define_values = {"with_numa_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between framework_shared_object and a bunch of other configurations
 # due to limitations in nested select() statements.
 config_setting(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 64aed375b28..8f5de683220 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -128,6 +128,9 @@ load(
     "tf_additional_libdevice_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
+    "tf_additional_numa_deps",
+    "tf_additional_numa_lib_defines",
+    "tf_additional_numa_copts",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
@@ -388,15 +391,15 @@ cc_library(
         ":platform_port_hdrs",
         ":platform_port_internal_hdrs",
     ],
-    copts = tf_copts(),
+    copts = tf_copts() + tf_additional_numa_copts(),
     visibility = ["//tensorflow/core:__subpackages__"],
     deps = [
         ":lib_platform",
         ":platform_base",
-        "//tensorflow/core/platform/default/build_config:port",
         "@com_google_absl//absl/base",
+        "//tensorflow/core/platform/default/build_config:port",
         "@snappy",
-    ],
+    ] + tf_additional_numa_deps(),
 )
 
 filegroup(
@@ -2278,11 +2281,14 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
 ]
 
 # Replicated for lib_internal and lib_internal_impl.
-LIB_INTERNAL_DEFINES = (tf_additional_lib_defines() + [
-    "TF_USE_SNAPPY",
-] + tf_additional_verbs_lib_defines() +
-                        tf_additional_mpi_lib_defines() +
-                        tf_additional_gdr_lib_defines())
+LIB_INTERNAL_DEFINES = (
+    tf_additional_lib_defines() + [
+        "TF_USE_SNAPPY",
+    ] + tf_additional_verbs_lib_defines() +
+    tf_additional_mpi_lib_defines() +
+    tf_additional_gdr_lib_defines() +
+    tf_additional_numa_lib_defines()
+)
 
 cc_library(
     name = "lib_internal",
@@ -2355,19 +2361,20 @@ cc_library(
     copts = tf_copts(),
     defines = LIB_INTERNAL_DEFINES,
     deps = tf_additional_lib_deps() + [
-        ":lib_hash_crc32c_accelerate_internal",
-        ":lib_proto_parsing",
-        ":abi",
-        ":core_stringpiece",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "//third_party/eigen3",
-        "//tensorflow/core/platform/default/build_config:platformlib",
-        "@snappy",
-        "@zlib_archive//:zlib",
-        "@double_conversion//:double-conversion",
-        "@protobuf_archive//:protobuf",
-    ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
+               ":lib_hash_crc32c_accelerate_internal",
+               ":lib_proto_parsing",
+               ":abi",
+               ":core_stringpiece",
+               "@com_google_absl//absl/memory",
+               "@com_google_absl//absl/strings",
+               "//third_party/eigen3",
+               "//tensorflow/core/platform/default/build_config:platformlib",
+               "@snappy",
+               "@zlib_archive//:zlib",
+               "@double_conversion//:double-conversion",
+               "@protobuf_archive//:protobuf",
+           ] + tf_protos_all_impl() + tf_protos_grappler_impl() +
+           tf_additional_numa_deps(),
 )
 
 # File compiled with extra flags to get cpu-specific acceleration.
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index f9ac4ff0bca..f6f449a95d2 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -725,6 +725,12 @@ def tf_additional_gdr_lib_defines():
         "//conditions:default": [],
     })
 
+def tf_additional_numa_lib_defines():
+    return select({
+        "//tensorflow:with_numa_support": ["TENSORFLOW_USE_NUMA"],
+        "//conditions:default": [],
+    })
+
 def tf_py_clif_cc(name, visibility = None, **kwargs):
     pass
 
@@ -757,3 +763,30 @@ def tf_additional_binary_deps():
             "//third_party/mkl:intel_binary_blob",
         ],
     )
+
+# NOTE(review): unlike tf_additional_numa_lib_defines(), the two helpers below
+# do not consult //tensorflow:with_numa_support, so hwloc and
+# TENSORFLOW_USE_NUMA are enabled on every non-excluded platform even without
+# --config=numa. Confirm this default is intended.
+def tf_additional_numa_deps():
+    return select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:darwin": [],
+        "//conditions:default": [
+            "@hwloc",
+        ],
+    })
+
+def tf_additional_numa_copts():
+    return select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:darwin": [],
+        "//conditions:default": [
+            "-Ithird_party/hwloc/hwloc-master/include",
+            "-DTENSORFLOW_USE_NUMA",
+        ],
+    })
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 807e0083229..1561632a49a 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -45,6 +45,10 @@ limitations under the License.
 #include <thread>
 #endif
 
+#ifdef TENSORFLOW_USE_NUMA
+#include "hwloc.h"  // TF:hwloc
+#endif
+
 namespace tensorflow {
 namespace port {
 
@@ -115,16 +119,103 @@ int NumHyperthreadsPerCore() {
   return (ht_per_core > 0) ? ht_per_core : 1;
 }
 
-bool NUMAEnabled() {
-  // Not yet implemented: coming soon.
-  return false;
+#ifdef TENSORFLOW_USE_NUMA
+namespace {
+// hwloc topology shared by all NUMA helpers in this file.
+static hwloc_topology_t hwloc_topology_handle;
+
+// Initializes and loads hwloc_topology_handle on first call. Returns true
+// iff both hwloc calls succeeded, i.e. the handle may be used.
+bool HaveHWLocTopology() {
+  // One time initialization
+  static bool init = []() {
+    if (hwloc_topology_init(&hwloc_topology_handle)) {
+      LOG(ERROR) << "Call to hwloc_topology_init() failed";
+      return false;
+    }
+    if (hwloc_topology_load(hwloc_topology_handle)) {
+      LOG(ERROR) << "Call to hwloc_topology_load() failed";
+      // Release what hwloc_topology_init() allocated.
+      hwloc_topology_destroy(hwloc_topology_handle);
+      return false;
+    }
+    return true;
+  }();
+  return init;
 }
 
-int NUMANumNodes() { return 1; }
+// Return the first hwloc object of the given type whose os_index
+// matches 'index', or nullptr if 'index' is negative or nothing matches.
+hwloc_obj_t GetHWLocTypeIndex(hwloc_obj_type_t tp, int index) {
+  hwloc_obj_t obj = nullptr;
+  if (index >= 0) {
+    while ((obj = hwloc_get_next_obj_by_type(hwloc_topology_handle, tp, obj)) !=
+           nullptr) {
+      if (obj->os_index == static_cast<unsigned>(index)) break;
+    }
+  }
+  return obj;
+}
+}  // namespace
+#endif  // TENSORFLOW_USE_NUMA
 
-void NUMASetThreadNodeAffinity(int node) {}
+bool NUMAEnabled() { return (NUMANumNodes() > 1); }
 
-int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
+int NUMANumNodes() {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    int num_numanodes =
+        hwloc_get_nbobjs_by_type(hwloc_topology_handle, HWLOC_OBJ_NUMANODE);
+    return std::max(1, num_numanodes);
+  } else {
+    return 1;
+  }
+#else
+  return 1;
+#endif  // TENSORFLOW_USE_NUMA
+}
+
+void NUMASetThreadNodeAffinity(int node) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    // Find the corresponding NUMA node topology object.
+    hwloc_obj_t obj = GetHWLocTypeIndex(HWLOC_OBJ_NUMANODE, node);
+    if (obj) {
+      hwloc_set_cpubind(hwloc_topology_handle, obj->cpuset,
+                        HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
+    } else {
+      LOG(ERROR) << "Could not find hwloc NUMA node " << node;
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
+}
+
+int NUMAGetThreadNodeAffinity() {
+  int node_index = kNUMANoAffinity;
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_cpuset_t thread_cpuset = hwloc_bitmap_alloc();
+    // An empty bitmap is included in every bitmap, so only scan the nodes
+    // when the thread binding was actually retrieved.
+    if (!hwloc_get_cpubind(hwloc_topology_handle, thread_cpuset,
+                           HWLOC_CPUBIND_THREAD)) {
+      hwloc_obj_t obj = nullptr;
+      // Return the first NUMA node whose cpuset is a (non-proper) superset of
+      // that of the current thread.
+      while ((obj = hwloc_get_next_obj_by_type(hwloc_topology_handle,
+                                               HWLOC_OBJ_NUMANODE, obj)) !=
+             nullptr) {
+        if (hwloc_bitmap_isincluded(thread_cpuset, obj->cpuset)) {
+          node_index = obj->os_index;
+          break;
+        }
+      }
+    }
+    hwloc_bitmap_free(thread_cpuset);
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  return node_index;
+}
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
@@ -154,12 +245,60 @@ void* Realloc(void* ptr, size_t size) { return realloc(ptr, size); }
 void Free(void* ptr) { free(ptr); }
 
 void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_obj_t numa_node = GetHWLocTypeIndex(HWLOC_OBJ_NUMANODE, node);
+    if (numa_node) {
+      return hwloc_alloc_membind(hwloc_topology_handle, size,
+                                 numa_node->nodeset, HWLOC_MEMBIND_BIND,
+                                 HWLOC_MEMBIND_BYNODESET);
+    } else {
+      LOG(ERROR) << "Failed to find hwloc NUMA node " << node;
+      // NUMAFree() releases through hwloc_free() whenever the topology is
+      // available, so the unbound fallback must also come from hwloc.
+      return hwloc_alloc(hwloc_topology_handle, size);
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
   return AlignedMalloc(size, minimum_alignment);
 }
 
-void NUMAFree(void* ptr, size_t size) { Free(ptr); }
+void NUMAFree(void* ptr, size_t size) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_free(hwloc_topology_handle, ptr, size);
+    return;
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  Free(ptr);
+}
 
-int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; }
+int NUMAGetMemAffinity(const void* addr) {
+  int node = kNUMANoAffinity;
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology() && addr) {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_get_area_memlocation(hwloc_topology_handle, addr, 4, nodeset,
+                                   HWLOC_MEMBIND_BYNODESET)) {
+      LOG(ERROR) << "Failed call to hwloc_get_area_memlocation.";
+    } else if (!hwloc_bitmap_iszero(nodeset)) {
+      // An empty nodeset is included in every bitmap and would wrongly match
+      // the first node, so it is left as kNUMANoAffinity.
+      hwloc_obj_t obj = nullptr;
+      while ((obj = hwloc_get_next_obj_by_type(
+                  hwloc_topology_handle, HWLOC_OBJ_NUMANODE, obj)) != nullptr) {
+        if (hwloc_bitmap_isincluded(nodeset, obj->nodeset)) {
+          node = obj->os_index;
+          break;
+        }
+      }
+    }
+    // Released on both the success and the failure path.
+    hwloc_bitmap_free(nodeset);
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  return node;
+}
 
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 0c81ebeefb5..525c05b369e 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -130,6 +130,7 @@ genrule(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:LICENSE",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
@@ -199,6 +200,7 @@ genrule(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:LICENSE",
         "@icu//:icu4j/main/shared/licenses/LICENSE",
         "@jpeg//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 90dfca2b444..88f13a051e9 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -171,6 +171,7 @@ filegroup(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:LICENSE",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@keras_applications_archive//:LICENSE",