// pytorch/test/cpp/nativert/test_static_kernel_ops.cpp

#include <fmt/format.h>
#include <gtest/gtest.h>
#include <torch/torch.h>
#include <random>
#include "test/cpp/nativert/static_kernel_test_utils.h" // @manual
namespace torch::nativert {
namespace {
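// Builds the (weight, indices, offsets, per_sample_weights) argument list
// shared by the quantized embedding_bag tests below. Weights are random
// uint8 rows; indices and offsets describe ten variable-length bags.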
std::vector<c10::IValue> generateArgsForQuantizedEmbeddingBag() {
  // Use a fixed seed so the generated inputs are reproducible across runs.
  std::mt19937 gen(42);
  std::uniform_int_distribution<int> int_dis(0, 15); // num_embeddings - 1
  int num_embeddings = 16;
  int embedding_dim = 32;
  int num_lengths = 10;
  auto weight =
      at::randint(0, 255, {num_embeddings, embedding_dim}).to(at::kByte);
  // Generate random bag lengths
  std::vector<int> np_lengths(num_lengths);
  for (auto& length : np_lengths) {
    length = int_dis(gen);
  }
  int total_length = 0;
  for (const auto& length : np_lengths) {
    total_length += length;
  }
  // Generate random indices into the embedding table
  at::Tensor indices =
      torch::empty({total_length}, torch::dtype(torch::kInt32));
  auto indices_accessor = indices.accessor<int, 1>();
  for (int i = 0; i < total_length; ++i) {
    indices_accessor[i] = int_dis(gen);
  }
  // Create lengths tensor (np_lengths outlives every use of this view)
  at::Tensor lengths = torch::from_blob(
      np_lengths.data(), {num_lengths}, torch::dtype(torch::kInt32));
  // Offsets are the exclusive prefix sum of lengths; the trailing total is
  // kept because the kernels are invoked with include_last_offset=true.
  at::Tensor offsets = torch::cat(
      {torch::zeros({1}, torch::dtype(torch::kInt32)),
       torch::cumsum(lengths, 0)});
  offsets = offsets.to(torch::dtype(torch::kInt32));
  at::Tensor per_sample_weights = at::randn(indices.sizes());
  std::vector<c10::IValue> args{weight, indices, offsets, per_sample_weights};
  return args;
}

std::vector<c10::IValue> generateArgsForEmbeddingBag(bool include_padding_idx) {
  torch::Tensor weight = torch::randn({10, 3}, torch::dtype(torch::kFloat32));
  torch::Tensor indices =
      torch::randint(0, 10, {20}, torch::dtype(torch::kInt64));
  torch::Tensor offsets =
      torch::tensor({0, 5, 10, 15, 20}, torch::dtype(torch::kInt64));
  torch::Tensor per_sample_weights =
      torch::rand({20}, torch::dtype(torch::kFloat32));
  // Define the padding_idx
  int64_t padding_idx = 1;
  // Create a vector of IValues to store the arguments
  std::vector<c10::IValue> args;
  args.emplace_back(weight);
  args.emplace_back(indices);
  args.emplace_back(offsets);
  args.emplace_back(per_sample_weights);
  if (include_padding_idx) {
    args.emplace_back(padding_idx);
  }
  return args;
}
} // namespace

TEST(StaticKernelTest, QuantizedEmbeddingBagByteRowwiseOffsets) {
  const std::string graph =
      R"(graph(%weight, %indices, %offsets, %per_sample_weights):
%out = torch.ops.quantized.embedding_bag_byte_rowwise_offsets.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, pruned_weights=false, per_sample_weights=%per_sample_weights, compressed_indices_mapping=None, include_last_offset=true)
%res = torch.ops.aten.clone.default(self=%out, memory_format=None)
return (%res)
)";
  std::vector<c10::IValue> args = generateArgsForQuantizedEmbeddingBag();
  testStaticKernelEquality(graph, args);
}

TEST(StaticKernelTest, QuantizedEmbeddingBag4BitRowwiseOffsets) {
  const std::string graph =
      R"(graph(%weight, %indices, %offsets, %per_sample_weights):
%out = torch.ops.quantized.embedding_bag_4bit_rowwise_offsets.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, pruned_weights=false, per_sample_weights=%per_sample_weights, compressed_indices_mapping=None, include_last_offset=true)
%res = torch.ops.aten.clone.default(self=%out, memory_format=None)
return (%res)
)";
  std::vector<c10::IValue> args = generateArgsForQuantizedEmbeddingBag();
  testStaticKernelEquality(graph, args);
}

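// aten.embedding_bag returns four tensors: the pooled output, offset2bag,
// bag_size, and max_indices (the last is only meaningful in max mode).
// Cloning each output forces the static kernel results to be materialized
// before comparison.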
TEST(StaticKernelTest, EmbeddingBag) {
  const std::string graph =
      R"(graph(%weight, %indices, %offsets, %per_sample_weights):
%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true)
%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None)
%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None)
%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None)
%res4 = torch.ops.aten.clone.default(self=%out3, memory_format=None)
return (%res1, %res2, %res3, %res4)
)";
  std::vector<c10::IValue> args = generateArgsForEmbeddingBag(false);
  testStaticKernelEquality(graph, args);

  // Also test a graph that never consumes the max_indices output (%out3)
  // and returns %out2 without cloning it.
  const std::string graph2 =
      R"(graph(%weight, %indices, %offsets, %per_sample_weights):
%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true)
%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None)
%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None)
%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None)
return (%res1, %res2, %res3, %out2)
)";
  std::vector<c10::IValue> args2 = generateArgsForEmbeddingBag(false);
  testStaticKernelEquality(graph2, args2);
}

TEST(StaticKernelTest, EmbeddingBagPaddingIdx) {
  const std::string graph =
      R"(graph(%weight, %indices, %offsets, %per_sample_weights, %padding_idx):
%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.padding_idx(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true, padding_idx=%padding_idx)
%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None)
%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None)
%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None)
%res4 = torch.ops.aten.clone.default(self=%out3, memory_format=None)
return (%res1, %res2, %res3, %res4)
)";
  std::vector<c10::IValue> args = generateArgsForEmbeddingBag(true);
  testStaticKernelEquality(graph, args);

  // Also test a graph that never consumes the max_indices output (%out3)
  // and returns %out2 without cloning it.
  const std::string graph2 =
      R"(graph(%weight, %indices, %offsets, %per_sample_weights, %padding_idx):
%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.padding_idx(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true, padding_idx=%padding_idx)
%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None)
%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None)
%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None)
return (%res1, %res2, %res3, %out2)
)";
  std::vector<c10::IValue> args2 = generateArgsForEmbeddingBag(true);
  testStaticKernelEquality(graph2, args2);
}

TEST(StaticKernelTest, Aten_ToCopy) {
  for (auto& target_dtype :
       {"None",
        "ScalarType::FLOAT",
        "ScalarType::DOUBLE",
        "ScalarType::HALF",
        "ScalarType::INT",
        "ScalarType::LONG"}) {
    for (auto& target_memory_format :
         {"None",
          "MemoryFormat::PreserveFormat",
          "MemoryFormat::ContiguousFormat"}) {
      for (auto& input_dtype :
           {at::kLong, at::kInt, at::kFloat, at::kDouble, at::kHalf}) {
        for (auto& permute_input : {true, false}) {
          const std::string graph = fmt::format(
              R"(graph(%input):
%out = torch.ops.aten._to_copy.default(self=%input, dtype={}, memory_format={})
return (%out)
)",
              target_dtype,
              target_memory_format);
          at::Tensor input =
              at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(input_dtype);
          if (permute_input) {
            input = input.permute({1, 0, 3, 2});
          }
          testStaticKernelEquality(graph, {input});
        }
      }
    }
  }
}

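// When the requested dtype already matches the input, the static _to_copy
// kernel is expected to return an alias of the input storage rather than a
// fresh copy. The checks below pin that behavior down across repeated runs,
// a storage swap, resizes, and a restride of the same input tensor.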
TEST(StaticKernelTest, Aten_ToCopy_Aliasing) {
  const std::string graph =
      R"(graph(%input):
%out = torch.ops.aten._to_copy.default(self=%input, dtype=ScalarType::FLOAT, memory_format=None)
return (%out))";
  at::Tensor input =
      at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(at::kFloat);
  torch::nativert::ExecutorConfig config;
  config.enableStaticCPUKernels = true;
  SimpleTestModelRunner runner(graph, config);

  // try standard aliasing case
  auto output = runner.run({input});
  EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage()));
  EXPECT_EQ(output[0].toTensor().dim(), 4);
  EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8);
  output = runner.run({input});
  EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage()));
  EXPECT_EQ(output[0].toTensor().dim(), 4);
  EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8);

  // try swapping out the input storage between runs
  at::Storage original_storage = input.storage();
  input.unsafeGetTensorImpl()->set_storage_keep_dtype(
      at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(at::kFloat).storage());
  output = runner.run({input});
  EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage()));
  EXPECT_FALSE(output[0].toTensor().storage().is_alias_of(original_storage));
  EXPECT_EQ(output[0].toTensor().dim(), 4);
  EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8);

  // try upsizing between runs
  input.resize_({16, 16, 16, 16, 16});
  output = runner.run({input});
  EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage()));
  EXPECT_EQ(output[0].toTensor().dim(), 5);
  EXPECT_EQ(output[0].toTensor().numel(), 16 * 16 * 16 * 16 * 16);

  // try downsizing between runs
  input.resize_({4});
  output = runner.run({input});
  EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage()));
  EXPECT_EQ(output[0].toTensor().dim(), 1);
  EXPECT_EQ(output[0].toTensor().numel(), 4);

  // try restriding between runs
  input.as_strided_({3, 2}, {3, 6}).random_();
  output = runner.run({input});
  EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage()));
  EXPECT_EQ(output[0].toTensor().dim(), 2);
  EXPECT_EQ(output[0].toTensor().numel(), 3 * 2);
  for (int i = 0; i < 3; i += 1) {
    for (int j = 0; j < 2; j += 1) {
      EXPECT_EQ(
          output[0].toTensor().index({i, j}).item().toFloat(),
          input.index({i, j}).item().toFloat());
    }
  }
}

TEST(StaticKernelTest, MulScalar) {
  const std::string graph = R"(graph(%in0_t, %in1_t):
%out = torch.ops.aten.mul.Scalar(self=%in0_t, other=%in1_t)
return (%out)
)";
  std::vector<std::pair<at::Tensor, std::vector<double>>> test_cases = {
      {at::rand({3, 4}), {2.0, -2.0, -2, 2, 0.0, 1e6, 1e-6, NAN, INFINITY}},
      {at::rand({2, 3, 4}), {2.0}},
      {at::rand({3, 4}, at::kFloat), {3.0}}, // fp32 tensor, double scalar
      {at::randint(0, 10, {3, 4}, at::kInt),
       {2.0}}, // int32 tensor, double scalar
      {at::rand({3, 4}, at::kHalf), {2.0}}, // fp16 tensor, double scalar
      {at::rand({3, 4}, at::kBFloat16), {2.0}}, // bf16 tensor, double scalar
      {at::randint(0, 10, {3, 4}, at::kInt),
       {2}}, // int32 tensor, integral-valued scalar
      {at::randint(0, 10, {3, 4}, at::kLong),
       {2}}, // int64 tensor, integral-valued scalar
      {at::rand({3, 4, 5}, at::kFloat).permute({2, 0, 1}),
       {2}}, // non-contiguous (permuted) fp32 tensor
      {at::rand({3, 4}, at::kFloat).t(),
       {2}}, // non-contiguous (transposed) fp32 tensor
  };
  for (const auto& [tensor, scalars] : test_cases) {
    for (double scalar : scalars) {
      std::vector<c10::IValue> inputs = {tensor, scalar};
      testStaticKernelEquality(graph, inputs);
    }
  }
}

TEST(StaticKernelTest, SymSizeInt) {
  const std::string graph = R"(graph(%self, %dim):
%out = torch.ops.aten.sym_size.int(self=%self, dim=%dim)
return (%out)
)";
  // Define test cases with different tensor shapes
  std::vector<at::Tensor> test_cases = {
      at::rand({3, 4, 5}), // standard 3D tensor
      at::rand({0, 4, 5}), // tensor with a zero-sized dimension
      at::rand({1}), // single-element tensor
      at::rand({2, 3, 4, 5, 6}), // high-dimensional tensor
      at::rand({3, 1, 5}) // tensor with a size-1 dimension
  };
  // Query every dimension of each test tensor
  for (const auto& tensor : test_cases) {
    for (int64_t dim = 0; dim < tensor.dim(); ++dim) {
      std::vector<c10::IValue> inputs = {tensor, dim};
      testStaticKernelEquality(graph, inputs);
    }
  }
}

TEST(StaticKernelTest, BucketizeTensor) {
  const std::string graph =
      R"(graph(%input, %boundaries, %out_int32, %right):
%out = torch.ops.aten.bucketize.Tensor(self=%input, boundaries=%boundaries, out_int32=%out_int32, right=%right)
return (%out)
)";
  std::vector<std::pair<bool, bool>> test_cases = {
      {false, false}, {true, false}, {false, true}, {true, true}};
  for (const auto& [out_int32, right] : test_cases) {
    at::Tensor input = at::tensor({0.1, 2.5, 3.0, 4.5, 5.0}, at::kFloat);
    at::Tensor boundaries = at::tensor({1.0, 2.0, 3.0, 4.0}, at::kFloat);
    std::vector<c10::IValue> args = {input, boundaries, out_int32, right};
    testStaticKernelEquality(graph, args);
  }
}

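// slice_scatter returns a copy of `self` with `src` written into the slice
// self[start:end:step] along `dim`; the case below overwrites rows 1 and 2
// of a 5x5 matrix with a 2x5 source.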
TEST(StaticKernelTest, SliceScatter) {
  const std::string graph =
      R"(graph(%self, %src, %dim, %start, %end, %step):
%out = torch.ops.aten.slice_scatter.default(self=%self, src=%src, dim=%dim, start=%start, end=%end, step=%step)
return (%out)
)";
  // Create input tensors
  at::Tensor self = at::rand({5, 5}, at::kFloat);
  at::Tensor src = at::rand({2, 5}, at::kFloat);
  int64_t dim = 0;
  int64_t start = 1;
  int64_t end = 3;
  int64_t step = 1;
  // Create a vector of IValues to pass as inputs
  std::vector<c10::IValue> inputs = {self, src, dim, start, end, step};
  // Run the kernel and verify the output
  testStaticKernelEquality(graph, inputs);
}

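// embedding_bag_byte_prepack quantizes each fp32 row to uint8 and appends a
// per-row fp32 scale and fp32 bias, so the packed result is expected to be
// {num_rows, embedding_dim + 8} bytes; byte_unpack reverses the transform.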
TEST(StaticKernelTest, QuantizedEmbeddingBagBytePrepack) {
  const std::string graph = R"(
graph(%input):
%weight = torch.ops.quantized.embedding_bag_byte_prepack.default(weight=%input)
%res = torch.ops.aten.clone.default(self=%weight, memory_format=None)
return (%res)
)";
  at::Tensor args1 = torch::randn({8, 16}, at::ScalarType::Float);
  testStaticKernelEquality(graph, {args1});
}

TEST(StaticKernelTest, QuantizedEmbeddingBagByteUnpack) {
  const std::string graph = R"(
graph(%input):
%weight = torch.ops.quantized.embedding_bag_byte_prepack.default(weight=%input)
%output = torch.ops.quantized.embedding_bag_byte_unpack.default(weight=%weight)
%res = torch.ops.aten.clone.default(self=%output, memory_format=None)
return (%res)
)";
  at::Tensor args1 = torch::randn({8, 16}, at::ScalarType::Float);
  testStaticKernelEquality(graph, {args1});
}

TEST(StaticKernelTest, QuantizedLinear) {
  const std::string graph = R"(
graph(%input, %weights):
%packed_params = torch.ops.quantized.linear_prepack.default(W=%weights, B=None)
%1254 = torch.ops.quantized.linear.default(X=%input, W_prepack=%packed_params, Y_scale_i=1.0, Y_zero_point_i=1)
%res = torch.ops.aten.dequantize.self(self=%1254)
return (%res)
)";
  at::Tensor input =
      at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQUInt8);
  at::Tensor weight =
      at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQInt8);
  testStaticKernelEquality(graph, {input, weight});
}

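// The NativeKernelTest cases below cover view-producing ops (view, permute,
// reshape, select, slice, narrow). Each graph clones the aliasing result so
// the equality check compares materialized tensors rather than views.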
TEST(NativeKernelTest, View) {
  const std::string source =
      R"(graph(%self):
%ret = torch.ops.aten.view.default(self=%self, size=[36])
%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None)
return (%cloned)
)";
  auto self0 = at::rand({6, 6});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(source, args, true);
}

TEST(NativeKernelTest, Permute) {
  const std::string source =
      R"(graph(%self):
%ret = torch.ops.aten.permute.default(self=%self, dims=[1, 0])
%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None)
return (%cloned)
)";
  auto self0 = at::rand({2, 3});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(source, args, true);
}

TEST(NativeKernelTest, Reshape) {
  const std::string source =
      R"(graph(%self):
%ret = torch.ops.aten.reshape.default(self=%self, shape=[9, 4])
%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None)
return (%cloned)
)";
  auto self0 = at::rand({3, 3, 4});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(source, args, true);
}

TEST(NativeKernelTest, Select) {
  static constexpr std::string_view source =
      R"(graph(%self):
%ret = torch.ops.aten.select.int(self=%self, dim=1, index=0)
%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None)
return (%cloned)
)";
  auto self0 = at::rand({3, 3, 3});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(source, args, true);
}

TEST(NativeKernelTest, Slice) {
  const std::string graph =
      R"(graph(%self):
%ret = torch.ops.aten.slice.Tensor(self=%self, dim=0, start=1, end=3, step=1)
%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None)
return (%cloned)
)";
  auto self0 = at::rand({5, 5});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(graph, args, true);
}

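// aten.split and its variants return a list of views into `self`, so these
// graphs return a tensor list rather than a single tensor.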
TEST(NativeKernelTest, Split) {
  const std::string graph =
      R"(graph(%self):
%ret = torch.ops.aten.split.Tensor(self=%self, split_size=2, dim=0)
return (%ret)
)";
  auto self0 = at::rand({6, 6});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(graph, args, true);
}

TEST(NativeKernelTest, SplitWithSizes) {
  const std::string graph =
      R"(graph(%self):
%ret = torch.ops.aten.split_with_sizes.default(self=%self, split_sizes=[2, 4], dim=0)
return (%ret)
)";
  auto self0 = at::rand({6, 6});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(graph, args, true);
}

TEST(NativeKernelTest, TensorSplitSections) {
  const std::string graph =
      R"(graph(%self):
%ret = torch.ops.aten.tensor_split.sections(self=%self, sections=3, dim=0)
return (%ret)
)";
  auto self0 = at::rand({9, 3});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(graph, args, true);
}

TEST(StaticKernelTest, Stack) {
  const std::string graph =
      R"(graph(%tensors):
%ret = torch.ops.aten.stack.default(tensors=%tensors, dim=0)
return (%ret)
)";
  auto tensor1 = at::rand({2, 3});
  auto tensor2 = at::rand({2, 3});
  auto tensor3 = at::rand({2, 3});
  std::vector<c10::IValue> args{
      std::vector<at::Tensor>{tensor1, tensor2, tensor3}};
  testStaticKernelEquality(graph, args, true);
}

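// aten.item extracts the value from a single-element tensor, so the graph
// below returns a scalar IValue rather than a tensor.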
TEST(NativeKernelTest, Item) {
  const std::string graph =
      R"(graph(%self):
%ret = torch.ops.aten.item.default(self=%self)
return (%ret)
)";
  auto self0 = at::tensor({42.0});
  std::vector<c10::IValue> args{self0};
  testStaticKernelEquality(graph, args, true);
}

TEST(NativeKernelTest, Narrow) {
  const std::string graph =
      R"(graph(%self, %dim, %start, %length):
%ret = torch.ops.aten.narrow.default(self=%self, dim=%dim, start=%start, length=%length)
%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None)
return (%cloned)
)";
  auto self = at::rand({5, 5});
  int64_t dim = 1;
  int64_t start = 1;
  int64_t length = 3;
  std::vector<c10::IValue> args{self, dim, start, length};
  testStaticKernelEquality(graph, args, true);
}

} // namespace torch::nativert