#include #include #include #include #include "test/cpp/nativert/static_kernel_test_utils.h" // @manual namespace torch::nativert { namespace { std::vector generateArgsForQuantizedEmbeddingBag() { // Set seed for reproducibility std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution int_dis(0, 15); // num_embeddings - 1 int num_embeddings = 16; int embedding_dim = 32; int num_lengths = 10; auto weight = at::randint(0, 255, {num_embeddings, embedding_dim}).to(at::kByte); // Generate random lengths std::vector np_lengths(num_lengths); for (auto& length : np_lengths) { length = int_dis(gen); } int total_length = 0; for (const auto& length : np_lengths) { total_length += length; } // Generate random indices at::Tensor indices = torch::empty({total_length}, torch::dtype(torch::kInt32)); auto indices_accessor = indices.accessor(); for (int i = 0; i < total_length; ++i) { indices_accessor[i] = int_dis(gen); } // Create lengths tensor at::Tensor lengths = torch::from_blob( np_lengths.data(), {num_lengths}, torch::dtype(torch::kInt32)); // Calculate offsets at::Tensor offsets = torch::cat( {torch::zeros({1}, torch::dtype(torch::kInt32)), torch::cumsum(lengths, 0)}); offsets = offsets.to(torch::dtype(torch::kInt32)); at::Tensor per_sample_weights = at::randn(indices.sizes()); std::vector args{weight, indices, offsets, per_sample_weights}; return args; } std::vector generateArgsForEmbeddingBag(bool include_padding_idx) { torch::Tensor weight = torch::randn({10, 3}, torch::dtype(torch::kFloat32)); torch::Tensor indices = torch::randint(0, 10, {20}, torch::dtype(torch::kInt64)); torch::Tensor offsets = torch::tensor({0, 5, 10, 15, 20}, torch::dtype(torch::kInt64)); torch::Tensor per_sample_weights = torch::rand({20}, torch::dtype(torch::kFloat32)); // Define the padding_idx int64_t padding_idx = 1; // Create a vector of IValues to store the arguments std::vector args; args.emplace_back(weight); args.emplace_back(indices); args.emplace_back(offsets); args.emplace_back(per_sample_weights); if (include_padding_idx) { args.emplace_back(padding_idx); } return args; } } // namespace TEST(StaticKernelTest, QuantizedEmbeddingBagByteRowwiseOffsets) { const std::string graph = R"(graph(%weight, %indices, %offsets, %per_sample_weights): %out = torch.ops.quantized.embedding_bag_byte_rowwise_offsets.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, pruned_weights=false, per_sample_weights=%per_sample_weights, compressed_indices_mapping=None, include_last_offset=true) %res = torch.ops.aten.clone.default(self=%out, memory_format=None) return (%res) )"; std::vector args = generateArgsForQuantizedEmbeddingBag(); testStaticKernelEquality(graph, args); } TEST(StaticKernelTest, QuantizedEmbeddingBag4BitRowwiseOffsets) { const std::string graph = R"(graph(%weight, %indices, %offsets, %per_sample_weights): %out = torch.ops.quantized.embedding_bag_4bit_rowwise_offsets.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, pruned_weights=false, per_sample_weights=%per_sample_weights, compressed_indices_mapping=None, include_last_offset=true) %res = torch.ops.aten.clone.default(self=%out, memory_format=None) return (%res) )"; std::vector args = generateArgsForQuantizedEmbeddingBag(); testStaticKernelEquality(graph, args); } TEST(StaticKernelTest, EmbeddingBag) { const std::string graph = R"(graph(%weight, %indices, %offsets, %per_sample_weights): %out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true) %res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) %res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) %res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) %res4 = torch.ops.aten.clone.default(self=%out3, memory_format=None) return (%res1, %res2, %res3, %res4) )"; std::vector args = generateArgsForEmbeddingBag(false); testStaticKernelEquality(graph, args); // Test use_max_indices False const std::string graph2 = R"(graph(%weight, %indices, %offsets, %per_sample_weights): %out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true) %res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) %res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) %res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) return (%res1, %res2, %res3, %out2) )"; std::vector args2 = generateArgsForEmbeddingBag(false); testStaticKernelEquality(graph2, args2); } TEST(StaticKernelTest, EmbeddingBagPaddingIdx) { const std::string graph = R"(graph(%weight, %indices, %offsets, %per_sample_weights, %padding_idx): %out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.padding_idx(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true, padding_idx=%padding_idx) %res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) %res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) %res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) %res4 = torch.ops.aten.clone.default(self=%out3, memory_format=None) return (%res1, %res2, %res3, %res4) )"; std::vector args = generateArgsForEmbeddingBag(true); testStaticKernelEquality(graph, args); // Test use_max_indices False const std::string graph2 = R"(graph(%weight, %indices, %offsets, %per_sample_weights, %padding_idx): %out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.padding_idx(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true, padding_idx=%padding_idx) %res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) %res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) %res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) return (%res1, %res2, %res3, %out2) )"; std::vector args2 = generateArgsForEmbeddingBag(true); testStaticKernelEquality(graph2, args2); } TEST(StaticKernelTest, Aten_ToCopy) { for (auto& target_dtype : {"None", "ScalarType::FLOAT", "ScalarType::DOUBLE", "ScalarType::HALF", "ScalarType::INT", "ScalarType::LONG"}) { for (auto& target_memory_format : { "None", "MemoryFormat::PreserveFormat", "MemoryFormat::ContiguousFormat", }) { for (auto& input_dtype : {at::kLong, at::kInt, at::kFloat, at::kDouble, at::kHalf}) { for (auto& permute_input : {true, false}) { const std::string graph = fmt::format( R"(graph(%input): %out = torch.ops.aten._to_copy.default(self=%input, dtype={}, memory_format={}) return (%out) )", target_dtype, target_memory_format); at::Tensor input = at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(input_dtype); if (permute_input) { input = input.permute({1, 0, 3, 2}); } testStaticKernelEquality(graph, {input}); } } } } } TEST(StaticKernelTest, Aten_ToCopy_Aliasing) { const std::string graph = R"(graph(%input): %out = torch.ops.aten._to_copy.default(self=%input, dtype=ScalarType::FLOAT, memory_format=None) return (%out))"; at::Tensor input = at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(at::kFloat); torch::nativert::ExecutorConfig config; config.enableStaticCPUKernels = true; SimpleTestModelRunner runner(graph, config); // try standard aliasing case auto output = runner.run({input}); EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); EXPECT_EQ(output[0].toTensor().dim(), 4); EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8); output = runner.run({input}); EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); EXPECT_EQ(output[0].toTensor().dim(), 4); EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8); // try swap out input storage between runs at::Storage original_storage = input.storage(); input.unsafeGetTensorImpl()->set_storage_keep_dtype( at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(at::kFloat).storage()); output = runner.run({input}); EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); EXPECT_FALSE(output[0].toTensor().storage().is_alias_of(original_storage)); EXPECT_EQ(output[0].toTensor().dim(), 4); EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8); // try to upsize between runs input.resize_({16, 16, 16, 16, 16}); output = runner.run({input}); EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); EXPECT_EQ(output[0].toTensor().dim(), 5); EXPECT_EQ(output[0].toTensor().numel(), 16 * 16 * 16 * 16 * 16); // try to downsize between runs input.resize_({4}); output = runner.run({input}); EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); EXPECT_EQ(output[0].toTensor().dim(), 1); EXPECT_EQ(output[0].toTensor().numel(), 4); // try to restride between runs input.as_strided_({3, 2}, {3, 6}).random_(); output = runner.run({input}); EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); EXPECT_EQ(output[0].toTensor().dim(), 2); EXPECT_EQ(output[0].toTensor().numel(), 3 * 2); for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 2; j += 1) { EXPECT_EQ( output[0].toTensor().index({i, j}).item().toFloat(), input.index({i, j}).item().toFloat()); } } } TEST(StaticKernelTest, MulScalar) { const std::string graph = R"(graph(%in0_t, %in1_t): %out = torch.ops.aten.mul.Scalar(self=%in0_t, other=%in1_t) return (%out) )"; std::vector>> test_cases = { {at::rand({3, 4}), {2.0, -2.0, -2, 2, 0.0, 1e6, 1e-6, NAN, INFINITY}}, {at::rand({2, 3, 4}), {2.0}}, {at::rand({3, 4}, at::kFloat), {3.0}}, // fp32 tensor with int scalar {at::randint(0, 10, {3, 4}, at::kInt), {2.0}}, // int32 tensor with double scalar {at::rand({3, 4}, at::kHalf), {2.0}}, // half tensor with float scalar {at::rand({3, 4}, at::kBFloat16), {2.0}}, // bf16 tensor with float scalar {at::randint(0, 10, {3, 4}, at::kInt), {2}}, // int tensor with int scalar {at::randint(0, 10, {3, 4}, at::kLong), {2}}, // int64 tensor with int64 scalar, {at::rand({3, 4, 5}, at::kFloat).permute({2, 0, 1}), {2}}, // int64 strided tensor with int64 scalar {at::rand({3, 4}, at::kFloat).t(), {2}}, // int64 strided tensor with int64 scalar {at::rand({3, 4, 5}, at::kFloat).permute({2, 0, 1}), {2}}, // int64 strided tensor with int64 scalar {at::rand({3, 4}, at::kFloat).t(), {2}}, // int64 strided tensor with int64 scalar }; for (const auto& [tensor, scalars] : test_cases) { for (double scalar : scalars) { std::vector inputs = {tensor, scalar}; testStaticKernelEquality(graph, inputs); } } } TEST(StaticKernelTest, SymSizeInt) { const std::string graph = R"(graph(%self, %dim): %out = torch.ops.aten.sym_size.int(self=%self, dim=%dim) return (%out) )"; // Define test cases with different tensors std::vector test_cases = { at::rand({3, 4, 5}), // standard 3D tensor at::rand({0, 4, 5}), // empty tensor at::rand({1}), // single-element tensor at::rand({2, 3, 4, 5, 6}), // high-dimensional tensor at::rand({3, 1, 5}) // tensor with one dimension as 1 }; // Iterate over each test case for (const auto& tensor : test_cases) { for (int64_t dim = 0; dim < tensor.dim(); ++dim) { std::vector inputs = {tensor, dim}; testStaticKernelEquality(graph, inputs); } } } TEST(StaticKernelTest, BucketizeTensor) { const std::string graph = R"(graph(%input, %boundaries, %out_int32, %right): %out = torch.ops.aten.bucketize.Tensor(self=%input, boundaries=%boundaries, out_int32=%out_int32, right=%right) return (%out) )"; std::vector> test_cases = { {false, false}, {true, false}, {false, true}, {true, true}}; for (const auto& [out_int32, right] : test_cases) { at::Tensor input = at::tensor({0.1, 2.5, 3.0, 4.5, 5.0}, at::kFloat); at::Tensor boundaries = at::tensor({1.0, 2.0, 3.0, 4.0}, at::kFloat); std::vector args = {input, boundaries, out_int32, right}; testStaticKernelEquality(graph, args); } } TEST(StaticKernelTest, SliceScatter) { const std::string graph = R"(graph(%self, %src, %dim, %start, %end, %step): %out = torch.ops.aten.slice_scatter.default(self=%self, src=%src, dim=%dim, start=%start, end=%end, step=%step) return (%out) )"; // Create input tensors at::Tensor self = at::rand({5, 5}, at::kFloat); at::Tensor src = at::rand({2, 5}, at::kFloat); int64_t dim = 0; int64_t start = 1; int64_t end = 3; int64_t step = 1; // Create a vector of IValues to pass as inputs std::vector inputs = {self, src, dim, start, end, step}; // Run the kernel and verify the output testStaticKernelEquality(graph, inputs); } TEST(StaticKernelTest, QuantizedEmbeddingBagBytePrepack) { const std::string graph = R"( graph(%input): %weight = torch.ops.quantized.embedding_bag_byte_prepack.default(weight=%input) %res = torch.ops.aten.clone.default(self=%weight, memory_format=None) return (%res) )"; at::Tensor args1 = torch::randn({8, 16}, at::ScalarType::Float); testStaticKernelEquality(graph, {args1}); } TEST(StaticKernelTest, QuantizedEmbeddingBagByteUnpack) { const std::string graph = R"( graph(%input): %weight = torch.ops.quantized.embedding_bag_byte_prepack.default(weight=%input) %output = torch.ops.quantized.embedding_bag_byte_unpack.default(weight=%weight) %res = torch.ops.aten.clone.default(self=%output, memory_format=None) return (%res) )"; at::Tensor args1 = torch::randn({8, 16}, at::ScalarType::Float); testStaticKernelEquality(graph, {args1}); } TEST(StaticKernelTest, QuantizedLinear) { const std::string graph = R"( graph(%input, %weights): %packed_params = torch.ops.quantized.linear_prepack.default(W=%weights, B=None) %1254 = torch.ops.quantized.linear.default(X=%input, W_prepack=%packed_params, Y_scale_i=1.0, Y_zero_point_i=1) %res = torch.ops.aten.dequantize.self(self=%1254) return (%res) )"; at::Tensor input = at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQUInt8); at::Tensor weight = at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQInt8); testStaticKernelEquality(graph, {input, weight}); } TEST(NativeKernelTest, View) { const std::string source = R"(graph(%self): %ret = torch.ops.aten.view.default(self=%self, size=[36]) %cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) return (%cloned) )"; auto self0 = at::rand({6, 6}); std::vector args{self0}; testStaticKernelEquality(source, args, true); } TEST(NativeKernelTest, Permute) { const std::string source = R"(graph(%self): %ret = torch.ops.aten.permute.default(self=%self, dims=[1, 0]) %cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) return (%cloned) )"; auto self0 = at::rand({2, 3}); std::vector args{self0}; testStaticKernelEquality(source, args, true); } TEST(NativeKernelTest, Reshape) { const std::string source = R"(graph(%self): %ret = torch.ops.aten.reshape.default(self=%self, shape=[9, 4]) %cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) return (%cloned) )"; auto self0 = at::rand({3, 3, 4}); std::vector args{self0}; testStaticKernelEquality(source, args, true); } TEST(NativeKernelTest, Select) { static constexpr std::string_view source = R"(graph(%self): %ret = torch.ops.aten.select.int(self=%self, dim=1, index=0) %cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) return (%cloned) )"; auto self0 = at::rand({3, 3, 3}); std::vector args{self0}; testStaticKernelEquality(source, args, true); } TEST(NativeKernelTest, Slice) { const std::string graph = R"(graph(%self): %ret = torch.ops.aten.slice.Tensor(self=%self, dim=0, start=1, end=3, step=1) %cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) return (%cloned) )"; auto self0 = at::rand({5, 5}); std::vector args{self0}; testStaticKernelEquality(graph, args, true); } TEST(NativeKernelTest, Split) { const std::string graph = R"(graph(%self): %ret = torch.ops.aten.split.Tensor(self=%self, split_size=2, dim=0) return (%ret) )"; auto self0 = at::rand({6, 6}); std::vector args{self0}; testStaticKernelEquality(graph, args, true); } TEST(NativeKernelTest, SplitWithSizes) { const std::string graph = R"(graph(%self): %ret = torch.ops.aten.split_with_sizes.default(self=%self, split_sizes=[2, 4], dim=0) return (%ret) )"; auto self0 = at::rand({6, 6}); std::vector args{self0}; testStaticKernelEquality(graph, args, true); } TEST(NativeKernelTest, TensorSplitSections) { const std::string graph = R"(graph(%self): %ret = torch.ops.aten.tensor_split.sections(self=%self, sections=3, dim=0) return (%ret) )"; auto self0 = at::rand({9, 3}); std::vector args{self0}; testStaticKernelEquality(graph, args, true); } TEST(StaticKernelTest, Stack) { const std::string graph = R"(graph(%tensors): %ret = torch.ops.aten.stack.default(tensors=%tensors, dim=0) return (%ret) )"; auto tensor1 = at::rand({2, 3}); auto tensor2 = at::rand({2, 3}); auto tensor3 = at::rand({2, 3}); std::vector args{ std::vector{tensor1, tensor2, tensor3}}; testStaticKernelEquality(graph, args, true); } TEST(NativeKernelTest, Item) { const std::string graph = R"(graph(%self): %ret = torch.ops.aten.item.default(self=%self) return (%ret) )"; auto self0 = at::tensor({42.0}); std::vector args{self0}; testStaticKernelEquality(graph, args, true); } TEST(NativeKernelTest, Narrow) { const std::string graph = R"(graph(%self, %dim, %start, %length): %ret = torch.ops.aten.narrow.default(self=%self, dim=%dim, start=%start, length=%length) %cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) return (%cloned) )"; auto self = at::rand({5, 5}); int64_t dim = 1; int64_t start = 1; int64_t length = 3; std::vector args{self, dim, start, length}; testStaticKernelEquality(graph, args, true); } } // namespace torch::nativert