[torchbench][optimus] Add backend optimus (#167357)
Summary: `--optimus [all | vertical_opt | horizontal_opt]` kicks off an inductor compile with different Optimus fusion strategies.

Test Plan:

TorchBench Runner:

```
$ buck2 run mode/opt //pytorch/benchmark:run -- customized_optimus_illustrative -t train -d cuda
GPU Time per batch:      56.254 milliseconds
CPU Wall Time per batch: 56.326 milliseconds
CPU Wall Time:           56.326 milliseconds
Time to first batch:     420.0777 ms
GPU 0 Peak Memory:       0.0695 GB
CPU Peak Memory:         359.6362 GB
```

PT2 Benchmark Runner (comparing with eager):

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --inductor
running benchmark: 100%|██████████| 30/30 [00:02<00:00, 14.37it/s]
4.509x
```

eager latency: ~56 ms
inductor latency: ~11 ms

Optimus backend:

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus all
11.02923508733511 ms, 13.884015614166856 ms, 0.794x
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus vertical_opt
12.47156853787601 ms, 10.699485195800662 ms, 1.166x
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus horizontal_opt
11.078484123572707 ms, 10.797873372212052 ms, 1.026x
```

optimus latency: ~10 ms

Differential Revision: D86524903

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167357
Approved by: https://github.com/mengluy0125
Committed by PyTorch MergeBot · parent f6331192b4 · commit 8d5cceeb6a
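In effect, each `--optimus` strategy just selects a set of Optimus pre-grad fusion passes for inductor. A minimal sketch of what `--optimus horizontal_opt` amounts to (illustrative only; the toy `model` and input are assumptions, while the pass names come from the diff below):

```python
import torch
import torch.nn as nn

# Hypothetical toy model and input, purely for illustration.
model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
x = torch.randn(4, 8)

# Enable the Optimus pre-grad fusion passes globally, then compile with
# the inductor backend; this is essentially what the new backend does.
torch._inductor.config.pre_grad_fusion_options = {
    "normalization_pass": {},
    "batch_linear": {},
    "batch_layernorm": {},
}
compiled = torch.compile(model, backend="inductor")
out = compiled(x)
```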
```diff
@@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
         first_fields.append(kwargs["tag"])
     headers = first_headers + ["speedup", "abs_latency"]
     row = first_fields + [float(speedup), median[1] * 1000]
-    msg = f"{speedup:.3f}x"
+    msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x"
     if args.baseline:
         headers.extend(
             [
```
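The new `msg` prints the baseline median next to the backend median instead of only the speedup, which is the "X ms, Y ms, Zx" format seen in the test plan above. A toy reproduction with the `--optimus all` numbers hardcoded (assuming, as the logged output suggests, that `speedup` is baseline over test):

```python
# median[0]: baseline (inductor) latency, median[1]: Optimus latency, in seconds.
median = (0.01102923508733511, 0.013884015614166856)
speedup = median[0] / median[1]

msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x"
print(msg)  # -> 11.029... ms, 13.884... ms, 0.794x
```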
```diff
@@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
     # Hypothetically you can use this from other places, but it's currently
     # inaccessible, and when this assert fails you need to update the
     # event_name here to account for the other cases you are using this
-    assert args.quantization is not None
+    assert any([args.quantization, args.optimus])
     output_signpost(
         dict(zip(headers, row)),
         args,
```
```diff
@@ -2587,6 +2587,9 @@ class BenchmarkRunner:
                 **experiment_kwargs,
             )
 
+            # reset dynamo
+            torch._dynamo.reset()
+
             if self.args.export_aot_inductor:
                 optimized_model_iter_fn = optimize_ctx
             else:
```
```diff
@@ -2950,7 +2953,7 @@ class BenchmarkRunner:
             status = self.check_tolerance(name, model, example_inputs, optimize_ctx)
             print(status)
         elif self.args.performance:
-            if self.args.backend == "torchao":
+            if self.args.backend in ["torchao", "optimus"]:
                 status = self.run_performance_test_non_alternate(
                     name, model, example_inputs, optimize_ctx, experiment, tag
                 )
```
```diff
@@ -3526,6 +3529,12 @@ def parse_args(args=None):
         action="store_true",
         help="Measure speedup with TorchInductor",
     )
+    group.add_argument(
+        "--optimus",
+        choices=["vertical_opt", "horizontal_opt", "all"],
+        default=None,
+        help="Measure speedup of Optimus with TorchInductor baseline",
+    )
     group.add_argument(
         "--quantization",
         choices=[
```
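For a quick sanity check of the new flag in isolation, a standalone argparse mirror (this is not the runner's actual parser, and the mutually exclusive group is an assumption):

```python
import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument(
    "--optimus",
    choices=["vertical_opt", "horizontal_opt", "all"],
    default=None,
    help="Measure speedup of Optimus with TorchInductor baseline",
)

args = parser.parse_args(["--optimus", "vertical_opt"])
assert args.optimus == "vertical_opt"
assert parser.parse_args([]).optimus is None  # default when the flag is omitted
```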
```diff
@@ -3783,6 +3792,9 @@ def run(runner, args, original_dir=None):
     if args.inductor:
         assert args.backend is None
         args.backend = "inductor"
+    if args.optimus:
+        assert args.backend is None
+        args.backend = "optimus"
     if args.quantization:
         assert args.backend is None
         args.backend = "torchao"
```
```diff
@@ -4067,10 +4079,22 @@ def run(runner, args, original_dir=None):
 
             runner.model_iter_fn = model_iter_fn_and_mark_step
             optimize_ctx = torchao_optimize_ctx(args.quantization)
+        elif args.backend == "optimus":
+            from .optimus import get_baseline_ctx, get_optimus_optimize_ctx
+
+            baseline_ctx = get_baseline_ctx(
+                nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode
+            )
+            runner.model_iter_fn = baseline_ctx(runner.model_iter_fn)
+            optimize_ctx = get_optimus_optimize_ctx(
+                args.optimus, args.nopython, args.inductor_compile_mode
+            )
         else:
             optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
         experiment = (
-            speedup_experiment if args.backend != "torchao" else latency_experiment
+            speedup_experiment
+            if args.backend not in ["torchao", "optimus"]
+            else latency_experiment
         )
         if args.accuracy:
             output_filename = f"accuracy_{args.backend}.csv"
```
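Note what this wiring implies: with `--optimus`, the runner compares two compiled variants rather than eager versus compiled. The baseline iteration function is compiled with plain inductor, and the test context compiles with the Optimus passes switched on, which is why `--optimus all` can report a sub-1x speedup against an inductor baseline. A condensed sketch of the pairing (the `step` function, model, and input are hypothetical; the real code routes through `get_baseline_ctx` and `get_optimus_optimize_ctx` from the new file below):

```python
import torch

def step(model, x):  # hypothetical per-batch function
    return model(x)

model = torch.nn.Linear(8, 8)
x = torch.randn(4, 8)

# Baseline: a plain inductor compile, as get_baseline_ctx produces.
baseline_step = torch.compile(step, backend="inductor")
baseline_step(model, x)  # trigger compilation before changing global config

# Test variant: the same compile, but with Optimus fusion passes enabled.
# In the real helpers this config write happens inside the wrapper
# returned by get_optimus_optimize_ctx.
torch._inductor.config.pre_grad_fusion_options = {"normalization_pass": {}}
optimus_step = torch.compile(step, backend="inductor")
optimus_step(model, x)
```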
```diff
@@ -4091,7 +4115,12 @@ def run(runner, args, original_dir=None):
     if args.only in runner.disable_cudagraph_models:
         args.disable_cudagraphs = True
 
-    if args.inductor or args.backend == "inductor" or args.export_aot_inductor:
+    if (
+        args.inductor
+        or args.backend == "inductor"
+        or args.export_aot_inductor
+        or args.backend == "optimus"
+    ):
         inductor_config.triton.cudagraphs = not args.disable_cudagraphs
         inductor_config.triton.persistent_reductions = (
             not args.disable_persistent_reductions
```
benchmarks/dynamo/optimus.py (new file, 62 lines)

```diff
@@ -0,0 +1,62 @@
+import functools
+
+import torch
+
+
+def get_baseline_ctx(nopython, inductor_compile_mode):
+    return functools.partial(
+        torch.compile,
+        backend="inductor",
+        fullgraph=nopython,
+        mode=inductor_compile_mode,
+    )
+
+
+def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode):
+    if config == "vertical_opt":
+        optimus_inductor_config = {
+            "pre_grad_fusion_options": {
+                "normalization_pass": {},
+                "merge_splits_pass": {},
+                "split_cat_pass": {},
+                "unbind_stack_pass": {},
+                "unbind_cat_to_view_pass": {},
+            }
+        }
+    elif config == "horizontal_opt":
+        optimus_inductor_config = {
+            "pre_grad_fusion_options": {
+                "normalization_pass": {},
+                "batch_linear": {},
+                "batch_layernorm": {},
+            },
+        }
+    elif config == "all":
+        optimus_inductor_config = {
+            "pre_grad_fusion_options": {
+                "normalization_pass": {},
+                "batch_linear": {},
+                "batch_layernorm": {},
+                "merge_splits_pass": {},
+                "split_cat_pass": {},
+                "unbind_stack_pass": {},
+                "unbind_cat_to_view_pass": {},
+            },
+        }
+    else:
+        raise RuntimeError(f"Unknown optimus config: {config}")
+
+    def _inner(fn):
+        if "pre_grad_fusion_options" in optimus_inductor_config:
+            torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[
+                "pre_grad_fusion_options"
+            ]
+        if "post_grad_fusion_options" in optimus_inductor_config:
+            torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[
+                "post_grad_fusion_options"
+            ]
+        return torch.compile(
+            fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode
+        )
+
+    return _inner
```
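A possible direct use of the new helpers (the import path is a guess from the file location, since the benchmark suite itself uses a relative import, and `train_iter` is a made-up iteration function):

```python
import torch
from optimus import get_baseline_ctx, get_optimus_optimize_ctx  # path assumption

def train_iter(model, x):  # hypothetical per-iteration function
    loss = model(x).sum()
    loss.backward()

# get_baseline_ctx returns a functools.partial of torch.compile,
# so applying it to a function yields the plain-inductor baseline.
baseline_iter = get_baseline_ctx(nopython=False, inductor_compile_mode=None)(train_iter)

# get_optimus_optimize_ctx returns a wrapper that first installs the chosen
# fusion passes in torch._inductor.config, then compiles with inductor.
optimus_iter = get_optimus_optimize_ctx("all", False, None)(train_iter)
```

Here `mode=None` and `fullgraph=False` mirror what the runner passes when the corresponding flags are left unset.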