[torchbench][optimus] Add backend optimus (#167357)

Summary: `--optimus [all | vertical_opt | horizontal_opt]` kicks off an inductor compile with the corresponding set of Optimus fusion passes.
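
Each mode maps to a fixed set of Optimus pre-grad fusion passes (see the new `optimus.py` file at the bottom of the diff): `vertical_opt` enables the normalization and split/cat/unbind rewrite passes, `horizontal_opt` enables normalization plus the batched linear/layernorm fusions, and `all` enables the union of the two.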

Test Plan:
TorchBench Runner:

```
$ buck2 run mode/opt //pytorch/benchmark:run -- customized_optimus_illustrative -t train -d cuda
GPU Time per batch:   56.254 milliseconds
CPU Wall Time per batch:  56.326 milliseconds
CPU Wall Time:        56.326 milliseconds
Time to first batch:          420.0777 ms
GPU 0 Peak Memory:              0.0695 GB
CPU Peak Memory:              359.6362 GB
```

PT2 Benchmark Runner (comparing with eager):

```
buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative  --performance --training --inductor

running benchmark: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 14.37it/s]
4.509x
```

eager latency: ~56 ms
inductor latency: ~11 ms

Optimus backend:

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus all
11.02923508733511 ms, 13.884015614166856 ms, 0.794x
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus vertical_opt
12.47156853787601 ms, 10.699485195800662 ms, 1.166x
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus horizontal_opt
11.078484123572707 ms, 10.797873372212052 ms, 1.026x
```

Optimus latency: ~10 ms. In each result line above, the first number is the inductor baseline latency, the second is the Optimus latency, and the third is the speedup (see the `msg` format change in the diff below).
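
Under the hood, each mode simply preloads the corresponding Optimus fusion passes into the inductor config before compiling. A minimal standalone sketch of the equivalent manual setup (the pass names come from the new `optimus.py` below; the toy model is illustrative):

```
import torch
import torch.nn as nn
from torch._inductor import config as inductor_config

# Manually enable the pre-grad fusion passes that `--optimus vertical_opt`
# selects via the new optimus.py module.
inductor_config.pre_grad_fusion_options = {
    "normalization_pass": {},
    "merge_splits_pass": {},
    "split_cat_pass": {},
    "unbind_stack_pass": {},
    "unbind_cat_to_view_pass": {},
}

model = nn.Linear(16, 16)
compiled = torch.compile(model, backend="inductor")
out = compiled(torch.randn(8, 16))
```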

Differential Revision: D86524903

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167357
Approved by: https://github.com/mengluy0125
Author: Xu Zhao
Date: 2025-11-11 00:35:27 +00:00
Committed by: PyTorch MergeBot
Parent: f6331192b4
Commit: 8d5cceeb6a
2 changed files with 96 additions and 5 deletions

Changed file (the PT2 benchmark runner):

@@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
first_fields.append(kwargs["tag"])
headers = first_headers + ["speedup", "abs_latency"]
row = first_fields + [float(speedup), median[1] * 1000]
-    msg = f"{speedup:.3f}x"
+    msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x"
if args.baseline:
headers.extend(
[
@@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
# Hypothetically you can use this from other places, but it's currently
# inaccessible, and when this assert fails you need to update the
# event_name here to account for the other cases you are using this
-    assert args.quantization is not None
+    assert any([args.quantization, args.optimus])
output_signpost(
dict(zip(headers, row)),
args,
@@ -2587,6 +2587,9 @@ class BenchmarkRunner:
**experiment_kwargs,
)
+        # reset dynamo
+        torch._dynamo.reset()
if self.args.export_aot_inductor:
optimized_model_iter_fn = optimize_ctx
else:
@@ -2950,7 +2953,7 @@ class BenchmarkRunner:
status = self.check_tolerance(name, model, example_inputs, optimize_ctx)
print(status)
elif self.args.performance:
-            if self.args.backend == "torchao":
+            if self.args.backend in ["torchao", "optimus"]:
status = self.run_performance_test_non_alternate(
name, model, example_inputs, optimize_ctx, experiment, tag
)
@@ -3526,6 +3529,12 @@ def parse_args(args=None):
action="store_true",
help="Measure speedup with TorchInductor",
)
+    group.add_argument(
+        "--optimus",
+        choices=["vertical_opt", "horizontal_opt", "all"],
+        default=None,
+        help="Measure speedup of Optimus with TorchInductor baseline",
+    )
group.add_argument(
"--quantization",
choices=[
@@ -3783,6 +3792,9 @@ def run(runner, args, original_dir=None):
if args.inductor:
assert args.backend is None
args.backend = "inductor"
+    if args.optimus:
+        assert args.backend is None
+        args.backend = "optimus"
if args.quantization:
assert args.backend is None
args.backend = "torchao"
@@ -4067,10 +4079,22 @@ def run(runner, args, original_dir=None):
runner.model_iter_fn = model_iter_fn_and_mark_step
optimize_ctx = torchao_optimize_ctx(args.quantization)
+    elif args.backend == "optimus":
+        from .optimus import get_baseline_ctx, get_optimus_optimize_ctx
+
+        baseline_ctx = get_baseline_ctx(
+            nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode
+        )
+        runner.model_iter_fn = baseline_ctx(runner.model_iter_fn)
+        optimize_ctx = get_optimus_optimize_ctx(
+            args.optimus, args.nopython, args.inductor_compile_mode
+        )
else:
optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
experiment = (
-        speedup_experiment if args.backend != "torchao" else latency_experiment
+        speedup_experiment
+        if args.backend not in ["torchao", "optimus"]
+        else latency_experiment
)
if args.accuracy:
output_filename = f"accuracy_{args.backend}.csv"
@@ -4091,7 +4115,12 @@ def run(runner, args, original_dir=None):
if args.only in runner.disable_cudagraph_models:
args.disable_cudagraphs = True
-    if args.inductor or args.backend == "inductor" or args.export_aot_inductor:
+    if (
+        args.inductor
+        or args.backend == "inductor"
+        or args.export_aot_inductor
+        or args.backend == "optimus"
+    ):
inductor_config.triton.cudagraphs = not args.disable_cudagraphs
inductor_config.triton.persistent_reductions = (
not args.disable_persistent_reductions

New file (the `optimus` module imported above):

@@ -0,0 +1,62 @@
import functools

import torch


def get_baseline_ctx(nopython, inductor_compile_mode):
    return functools.partial(
        torch.compile,
        backend="inductor",
        fullgraph=nopython,
        mode=inductor_compile_mode,
    )


def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode):
    if config == "vertical_opt":
        optimus_inductor_config = {
            "pre_grad_fusion_options": {
                "normalization_pass": {},
                "merge_splits_pass": {},
                "split_cat_pass": {},
                "unbind_stack_pass": {},
                "unbind_cat_to_view_pass": {},
            }
        }
    elif config == "horizontal_opt":
        optimus_inductor_config = {
            "pre_grad_fusion_options": {
                "normalization_pass": {},
                "batch_linear": {},
                "batch_layernorm": {},
            },
        }
    elif config == "all":
        optimus_inductor_config = {
            "pre_grad_fusion_options": {
                "normalization_pass": {},
                "batch_linear": {},
                "batch_layernorm": {},
                "merge_splits_pass": {},
                "split_cat_pass": {},
                "unbind_stack_pass": {},
                "unbind_cat_to_view_pass": {},
            },
        }
    else:
        raise RuntimeError(f"Unknown optimus config: {config}")

    def _inner(fn):
        if "pre_grad_fusion_options" in optimus_inductor_config:
            torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[
                "pre_grad_fusion_options"
            ]
        if "post_grad_fusion_options" in optimus_inductor_config:
            torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[
                "post_grad_fusion_options"
            ]
        return torch.compile(
            fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode
        )

    return _inner
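
For context, a minimal usage sketch of the two helpers above (the toy model and iteration function are illustrative stand-ins for what the benchmark harness wires in):

```
import torch
import torch.nn as nn

# Assumes the file above is importable as `optimus`.
from optimus import get_baseline_ctx, get_optimus_optimize_ctx

model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16))

def train_iter(mod, inputs):
    # One forward/backward step, standing in for the harness's model_iter_fn.
    mod(inputs).sum().backward()

# Baseline: a plain inductor compile of the iteration function.
baseline_iter = get_baseline_ctx(nopython=False, inductor_compile_mode=None)(train_iter)

# Optimus: the same compile, with the vertical fusion passes preloaded.
optimus_iter = get_optimus_optimize_ctx("vertical_opt", False, None)(train_iter)

x = torch.randn(8, 16)
baseline_iter(model, x)
optimus_iter(model, x)
```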