From 359e02ef2047e0178c9e1978c8fa294e6782c56f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 4 Apr 2019 13:00:30 -0700 Subject: [PATCH] [XLA:CPU] Adds HLO-level tracing. PiperOrigin-RevId: 241989294 --- tensorflow/compiler/xla/service/cpu/BUILD | 2 + .../compiler/xla/service/cpu/cpu_runtime.cc | 23 ++++++ .../compiler/xla/service/cpu/cpu_runtime.h | 10 +++ .../compiler/xla/service/cpu/ir_emitter.cc | 72 +++++++++++++++++++ .../compiler/xla/service/cpu/ir_emitter.h | 16 +++++ .../xla/service/cpu/simple_orc_jit.cc | 2 + 6 files changed, 125 insertions(+) diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index e5f51fd7ed4..445265d6178 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -253,6 +253,7 @@ cc_library( "//tensorflow/compiler/xla/service:tuple_points_to_analysis", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor/host:host_stream", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -508,6 +509,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index d8878e622c0..b09fead49b7 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/stream_executor/stream_executor.h" namespace xla { @@ -86,7 +87,11 @@ extern const char* const kParallelForkJoinSymbolName = "__xla_cpu_runtime_ParallelForkJoin"; extern const char* const kKeyValueSortSymbolName = "__xla_cpu_runtime_KeyValueSort"; +extern const char* const kTracingStartSymbolName = + "__xla_cpu_runtime_TracingStart"; +extern const char* const kTracingEndSymbolName = "__xla_cpu_runtime_TracingEnd"; extern const char* const kXlaCpuRuntimeSymbolNamePrefix = "__xla_cpu_runtime_"; + } // namespace runtime } // namespace cpu } // namespace xla @@ -104,6 +109,24 @@ tensorflow::string ShapeString(const void* shape_ptr, xla::int32 shape_length) { } // namespace +extern "C" { + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY xla::int64 __xla_cpu_runtime_TracingStart( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + const char* name) { + VLOG(3) << "TracingStart " << name; + return tensorflow::profiler::TraceMe::ActivityStart(name); +} + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_TracingEnd( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + xla::int64 id) { + VLOG(3) << "TracingEnd " << id; + tensorflow::profiler::TraceMe::ActivityEnd(id); +} + +} // extern "C" + TF_ATTRIBUTE_NO_SANITIZE_MEMORY void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue( const xla::ExecutableRunOptions* run_options, xla::int32 buffer_length, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 3a2b44d8c1a..684cb92c217 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -66,6 +66,9 @@ extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName; extern const char* const kParallelForkJoinSymbolName; extern const char* const kKeyValueSortSymbolName; +extern const char* const kTracingStartSymbolName; +extern const char* const kTracingEndSymbolName; + // All symbol names for XLA CPU runtime functions need to start with this // prefix. extern const char* const kXlaCpuRuntimeSymbolNamePrefix; @@ -80,6 +83,13 @@ XfeedManager* GetXfeedManager(int device_ordinal); extern "C" { +extern xla::int64 __xla_cpu_runtime_TracingStart( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + const char* name); +extern void __xla_cpu_runtime_TracingEnd( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + xla::int64 id); + // Some things common to all of the runtime entry points below: // // * The shape pointer and shape_length reflect values that can be deserialized diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 63b0500003a..06ea62d552c 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -176,6 +176,13 @@ StatusOr IrEmitter::EmitComputation( bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 || arch_type_ == llvm::Triple::ArchType::x86_64; profiling_state_ = ProfilingState(use_rdtscp); + + bool emit_tracing = + hlo_module_config_.hlo_profiling_enabled() && + hlo_module_config_.debug_options().xla_backend_extra_options().count( + "xla_hlo_trace"); + tracing_state_.set_enabled(emit_tracing); + TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, instruction_order)); llvm::Function* ir_function = compute_function_->function(); InsertOrDie(&emitted_functions_, computation, ir_function); @@ -2883,9 +2890,70 @@ void IrEmitter::ProfilingState::RecordCompleteComputation( } } +void IrEmitter::TracingState::EmitTracingStart(llvm::IRBuilder<>* b, + HloInstruction* hlo, + llvm::Value* run_options) { + if (!enabled_) { + return; + } + + llvm::Type* int8_ptr_type = b->getInt8Ty()->getPointerTo(); + llvm::Type* void_ptr_type = b->getVoidTy()->getPointerTo(); + llvm::FunctionType* fn_type = + llvm::FunctionType::get(b->getInt64Ty(), {void_ptr_type, int8_ptr_type}, + /*isVarArg=*/false); + + llvm::Function* function = b->GetInsertBlock()->getParent(); + llvm::Module* module = function->getParent(); + const char* fn_name = runtime::kTracingStartSymbolName; + llvm::FunctionCallee trace_func = + module->getOrInsertFunction(fn_name, fn_type); + if (auto* fn = llvm::dyn_cast(trace_func.getCallee())) { + fn->setCallingConv(llvm::CallingConv::C); + fn->setDoesNotThrow(); + fn->setOnlyAccessesArgMemory(); + } + auto* hlo_name = b->CreateGlobalStringPtr(hlo->name()); + auto* activity_id = + b->CreateCall(trace_func, {b->CreateBitCast(run_options, void_ptr_type), + b->CreateBitCast(hlo_name, int8_ptr_type)}); + activity_id->setName(IrName(hlo, "activity_id")); + activity_ids_[hlo] = activity_id; +} + +void IrEmitter::TracingState::EmitTracingEnd(llvm::IRBuilder<>* b, + HloInstruction* hlo, + llvm::Value* run_options) { + if (!enabled_) { + return; + } + + llvm::Type* void_ptr_type = b->getVoidTy()->getPointerTo(); + llvm::FunctionType* fn_type = + llvm::FunctionType::get(b->getVoidTy(), {void_ptr_type, b->getInt64Ty()}, + /*isVarArg=*/false); + + llvm::Function* function = b->GetInsertBlock()->getParent(); + llvm::Module* module = function->getParent(); + const char* fn_name = runtime::kTracingEndSymbolName; + llvm::FunctionCallee trace_func = + module->getOrInsertFunction(fn_name, fn_type); + if (auto* fn = llvm::dyn_cast(trace_func.getCallee())) { + fn->setCallingConv(llvm::CallingConv::C); + fn->setDoesNotThrow(); + fn->setOnlyAccessesArgMemory(); + } + auto* activity_id = activity_ids_.at(hlo); + b->CreateCall(trace_func, + {b->CreateBitCast(run_options, void_ptr_type), activity_id}); +} + Status IrEmitter::Preprocess(HloInstruction* hlo) { VLOG(3) << "Visiting: " << hlo->ToString(); if (instruction_to_profile_idx_.count(hlo)) { + // Only trace the same HLOs that the profiler does. + tracing_state_.EmitTracingStart(&b_, hlo, + GetExecutableRunOptionsArgument()); profiling_state_.RecordCycleStart(&b_, hlo); } return Status::OK(); @@ -2895,6 +2963,10 @@ Status IrEmitter::Postprocess(HloInstruction* hlo) { if (auto* prof_counter = GetProfileCounterFor(*hlo)) { profiling_state_.RecordCycleDelta(&b_, hlo, prof_counter); } + // Only trace the same HLOs that the profiler does. + if (instruction_to_profile_idx_.count(hlo)) { + tracing_state_.EmitTracingEnd(&b_, hlo, GetExecutableRunOptionsArgument()); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 2cd624d70d5..44d660fbb1f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -531,6 +531,22 @@ class IrEmitter : public DfsHloVisitorWithDefault, ProfilingState profiling_state_; + class TracingState { + public: + TracingState() : enabled_(false) {} + void set_enabled(bool value) { enabled_ = value; } + void EmitTracingStart(llvm::IRBuilder<>* b, HloInstruction* hlo, + llvm::Value* run_options); + void EmitTracingEnd(llvm::IRBuilder<>* b, HloInstruction* hlo, + llvm::Value* run_options); + + private: + bool enabled_; + // Maps from HLO to the activity id returned by xprof::TraceMe. + std::unordered_map activity_ids_; + }; + TracingState tracing_state_; + // Given a load instruction and a shape or buffer size, annotate the load's // result with the alignment required by the shape or size. void AttachAlignmentMetadataForLoad(llvm::LoadInst* load, const Shape& shape); diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index a86edd8a181..02b2c32f402 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -245,6 +245,8 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue); REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation); REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSort); + REGISTER_CPU_RUNTIME_SYMBOL(TracingStart); + REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); registry->Register("__gnu_f2h_ieee", reinterpret_cast(__gnu_f2h_ieee)); registry->Register("__gnu_h2f_ieee", reinterpret_cast(__gnu_h2f_ieee));