# Owner(s): ["module: inductor"]
import itertools
import sys
import unittest
from typing import NamedTuple

import torch

from torch._inductor import config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch.testing._internal.common_utils import slowTest
from torch.testing._internal.inductor_utils import GPU_TYPE, RUN_GPU


device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
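
# Import the sibling Inductor test modules: the relative import covers runs as part of
# the test package, while the plain imports cover standalone and fbcode invocations.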
try:
    try:
        from . import (
            test_combo_kernels,
            test_foreach,
            test_pattern_matcher,
            test_select_algorithm,
            test_torchinductor,
            test_torchinductor_dynamic_shapes,
        )
    except ImportError:
        import test_combo_kernels  # @manual=fbcode//caffe2/test/inductor:combo_kernels-library
        import test_foreach  # @manual=fbcode//caffe2/test/inductor:foreach-library
        import test_pattern_matcher  # @manual=fbcode//caffe2/test/inductor:pattern_matcher-library
        import test_select_algorithm  # @manual=fbcode//caffe2/test/inductor:select_algorithm-library
        import test_torchinductor  # @manual=fbcode//caffe2/test/inductor:test_inductor-library
        import test_torchinductor_dynamic_shapes  # @manual=fbcode//caffe2/test/inductor:test_inductor-library_dynamic_shapes
except unittest.SkipTest:
    if __name__ == "__main__":
        sys.exit(0)
    raise


class GpuWrapperTemplate:
    pass


class TestGpuWrapper(InductorTestCase):
    device = GPU_TYPE

    def test_aoti_debug_printer_works_on_constants(self):
        batch_size = 32
        seq_length = 50
        hidden_size = 768

        def test_fn():
            inp = torch.randn(batch_size, seq_length, hidden_size, device=self.device)
            weight = torch.randn(hidden_size, hidden_size, device=self.device)
            matmul_output = inp @ weight
            torch.nn.LayerNorm(hidden_size, device=self.device)(matmul_output)
            return True

        comp = torch.compile(
            options={
                "cpp_wrapper": True,
                "aot_inductor.debug_intermediate_value_printer": "2",
            }
        )(test_fn)
        comp()

    def test_non_tensor_args_wrapped_on_cpu(self):
        if not RUN_GPU:
            self.skipTest("GPU not available")

        def test_fn(x, s):
            return (x + s).sum()

        compiled = torch.compile(options={"cpp_wrapper": True})(test_fn)
        x = torch.randn(4, device=self.device)
        with torch.utils._device.DeviceContext(self.device):
            _, code = test_torchinductor.run_and_get_cpp_code(compiled, x, 3)
        self.assertIn("torch.tensor(arg, device='cpu')", code)


class DynamicShapesGpuWrapperGpuTests(InductorTestCase):
    device = GPU_TYPE

    def test_annotation_training(self):
        batch_size = 32
        seq_length = 50
        hidden_size = 768

        def create_test_fn():
            def test_fn():
                inp = torch.randn(
                    batch_size, seq_length, hidden_size, device=self.device
                )
                weight = torch.randn(hidden_size, hidden_size, device=self.device)
                matmul_output = inp @ weight
                torch.nn.LayerNorm(hidden_size, device=self.device)(matmul_output)
                return True

            return test_fn

        fn = torch.compile(options={"annotate_training": True, "cpp_wrapper": True})(
            create_test_fn()
        )
        fn()
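

# Tests listed here are either skipped outright (is_skip=True) or treated as expected
# failures (is_skip=False) when the templated tests below are copied with the
# "gpu_wrapper" suffix.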
test_failures_gpu_wrapper = {
    "test_mm_plus_mm2_dynamic_shapes": test_torchinductor.TestFailure(
        ("gpu_wrapper",), is_skip=True
    ),
    "test_randint_xpu": test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=False),
    "test_randint_xpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("gpu_wrapper",), is_skip=False
    ),
    # ATen ops: scaled_dot_product_efficient_attention not implemented on XPU.
    "test_scaled_dot_product_efficient_attention_xpu": test_torchinductor.TestFailure(
        ("gpu_wrapper",), is_skip=False
    ),
    "test_scaled_dot_product_efficient_attention_xpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("gpu_wrapper",), is_skip=False
    ),
}


def make_test_case(
    name,
    device,
    tests,
    condition=True,
    slow=False,
    func_inputs=None,
    code_string_count=None,
    check_code=True,
):
    test_name = f"{name}_{device}" if device else name
    if code_string_count is None:
        code_string_count = {}

    func = getattr(tests, test_name)
    assert callable(func), "not a callable"
    func = slowTest(func) if slow else func

    @config.patch(cpp_wrapper=True)
    def fn(self):
        tests.setUpClass()
        tests.setUp()
        try:
            with torch._C._PreserveDispatchKeyGuard():
                torch._C._dispatch_tls_set_dispatch_key_included(
                    torch._C.DispatchKey.Dense, True
                )

                _, code = test_torchinductor.run_and_get_cpp_code(
                    func, *func_inputs if func_inputs else []
                )
                if check_code:
                    self.assertEqual("CppWrapperCodeCache" in code, True)
                    self.assertTrue(
                        all(
                            code.count(string) == code_string_count[string]
                            for string in code_string_count
                        )
                    )
        finally:
            tests.tearDown()
            tests.tearDownClass()

    fn.__name__ = test_name
    import copy

    fn.__dict__ = copy.deepcopy(func.__dict__)
    if condition:
        setattr(
            GpuWrapperTemplate,
            test_name,
            fn,
        )
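
# Each entry in the list below names an existing test (mostly from test_torchinductor's
# GPUTests) that make_test_case copies onto GpuWrapperTemplate, so the same test body is
# re-run with cpp_wrapper codegen enabled.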

if RUN_GPU:

    class BaseTest(NamedTuple):
        name: str
        device: str = GPU_TYPE
        tests: InductorTestCase = test_torchinductor.GPUTests()
        check_code: bool = True

    # Not implemented on XPU yet
    XPU_BASE_TEST_SKIP = [
        "test_dynamic_shapes_persistent_reduction_mixed_x_dim",
    ]

    # Maintain two separate test lists for cuda and cpp for now
    for item in [
        BaseTest("test_add_complex"),
        BaseTest("test_add_complex4"),
        BaseTest("test_as_strided"),  # buffer reuse
        BaseTest("test_batch_norm_2d_2"),
        BaseTest("test_bernoulli1_combo_kernels_False"),
        BaseTest("test_bernoulli1_combo_kernels_True"),
        BaseTest("test_bitwise"),  # int32
        BaseTest("test_bmm1"),
        BaseTest("test_bmm2"),
        BaseTest("test_buffer_use_after_remove"),
        BaseTest("test_cat"),  # alias
        BaseTest("test_convolution1"),
        BaseTest("test_conv_backward"),
        BaseTest("test_custom_op_1"),
        BaseTest("test_custom_op_2"),
        BaseTest("test_custom_op_3"),
        BaseTest("test_embedding_bag"),  # test default FallbackKernel
        BaseTest("test_index_put_deterministic_fallback"),
        BaseTest("test_adding_tensor_offsets"),
        BaseTest("test_index_tensor"),
        BaseTest("test_inductor_layout_optimization_input_mutations"),
        BaseTest("test_insignificant_strides"),
        BaseTest("test_layer_norm"),
        BaseTest("test_linear1"),
        BaseTest("test_linear2"),
        BaseTest("test_mm_views"),
        BaseTest("test_multi_device"),
        BaseTest("test_multi_threading"),
        BaseTest("test_pow3"),
        BaseTest("test_profiler_mark_wrapper_call"),
        BaseTest("test_randint"),
        BaseTest("test_reduction1"),  # Reduction
        BaseTest("test_relu"),  # multiple inputs
        BaseTest("test_repeat_interleave_2"),
        BaseTest("test_roi_align"),
        BaseTest("test_scalar_input"),
        BaseTest("test_scaled_dot_product_attention"),
        BaseTest("test_scaled_dot_product_efficient_attention"),
        BaseTest("test_sort"),
        BaseTest("test_silu"),  # single input, single output
        BaseTest("test_sum_dtype"),  # float64
        BaseTest("test_sum_int"),  # bool, int64, int8, uint8
        BaseTest("test_transpose"),  # multiple outputs, buffer clear
        *[
            BaseTest(f"test_unspec_inputs_{str(dtype)[6:]}")
            for dtype in test_torchinductor.test_dtypes
        ],
        BaseTest("test_consecutive_split_cumprod"),
        BaseTest("test_pointwise_hermite_polynomial_he"),
        BaseTest("test_pointwise_hermite_polynomial_h"),
        BaseTest(
            "test_foreach_cpp_wrapper",
            tests=test_foreach.ForeachTests(),
        ),  # test foreach
        BaseTest(
            "test_enable_dynamic_shapes_cpp_wrapper",
            tests=test_foreach.ForeachTests(),
        ),
        BaseTest(
            "test_dynamic_shapes_persistent_reduction_mixed_x_dim",
            tests=test_combo_kernels.ComboKernelDynamicShapesTests(),
        ),
        BaseTest(
            "test_cat_slice_cat",
            tests=test_pattern_matcher.TestPatternMatcher(),
        ),
        # TODO: Re-enable this test after fixing cuda wrapper for conv Triton templates with dynamic shapes.
        # This test is unstable: it succeeds when an ATen kernel is used and fails when a Triton kernel is used.
        # Currently it passes on CI (an ATen kernel is chosen) and fails locally (a Triton kernel is chosen).
        # Ideally, it should succeed regardless of which kernel is chosen.
        # BaseTest(
        #     "test_convolution1",
        #     device=None,
        #     tests=test_select_algorithm.TestSelectAlgorithm(),
        # ),
        BaseTest(
            "test_mm_plus_mm2",
            device=None,
            tests=test_select_algorithm.TestSelectAlgorithm(),
        ),
        BaseTest(
            "test_mm_plus_mm3",
            device=None,
            tests=test_select_algorithm.TestSelectAlgorithm(),
        ),
        BaseTest("test_fft_real_input"),
        BaseTest("test_fft_real_input_real_output"),
        *[
            # Some dtypes may raise an exception and be skipped in test_dtypeview,
            # so set check_code to False here.
            BaseTest(
                f"test_dtypeview_{str(dtype_x)[6:]}_{str(dtype_y)[6:]}",
                check_code=False,
            )
            for dtype_x, dtype_y in itertools.product(
                test_torchinductor.test_dtypes, test_torchinductor.test_dtypes
            )
        ],
        BaseTest("test_dtypeview_fusion"),
        # skip if not enough SMs
        BaseTest(
            "test_addmm",
            device=None,
            tests=test_select_algorithm.TestSelectAlgorithm(),
        ),
        # skip if not enough SMs
        BaseTest(
            "test_linear_relu",
            device=None,
            tests=test_select_algorithm.TestSelectAlgorithm(),
        ),
    ]:
        if item.device == "xpu" and item.name in XPU_BASE_TEST_SKIP:
            continue
        make_test_case(item.name, item.device, item.tests, check_code=item.check_code)

    from torch._inductor.utils import is_big_gpu

    if GPU_TYPE in ("cuda", "xpu") and not is_big_gpu():
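        # Without enough SMs, Inductor disables max-autotune GEMM selection, so these
        # select_algorithm-based tests cannot autotune and would fail; skip them here.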
        skip_list = ["test_addmm", "test_linear_relu"]
        # need to skip instead of omit, otherwise fbcode ci can be flaky
        for test_name in skip_list:
            test_failures_gpu_wrapper[f"{test_name}_{device_type}"] = (
                test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=True)
            )
            test_failures_gpu_wrapper[f"{test_name}_gpu_dynamic_shapes"] = (
                test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=True)
            )

    test_torchinductor.copy_tests(
        GpuWrapperTemplate, TestGpuWrapper, "gpu_wrapper", test_failures_gpu_wrapper
    )

    DynamicShapesGpuWrapperTemplate = (
        test_torchinductor_dynamic_shapes.make_dynamic_cls(GpuWrapperTemplate)
    )

    test_torchinductor.copy_tests(
        DynamicShapesGpuWrapperTemplate,
        DynamicShapesGpuWrapperGpuTests,
        "gpu_wrapper",
        test_failures_gpu_wrapper,
        xfail_prop="_expected_failure_dynamic_wrapper",
    )


if __name__ == "__main__":
    from torch._inductor.test_case import run_tests

    if RUN_GPU:
        run_tests(needs="filelock")