Fix clang-tidy warnings on c10/xpu files (#169231)

This PR fixes clang-tidy warnings on c10/xpu files.
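Most of the changes below fall into a handful of recurring patterns: targeted NOLINTNEXTLINE suppressions where the flagged construct is intentional, in-class default member initializers instead of repeating constants in constructors, const-reference parameters for read-only arguments, and explicit casts for mixed float/integer arithmetic. A minimal standalone sketch of these patterns (illustrative only, not code from this PR; all names here are made up):

#include <cstddef>

struct Params {
  // Instead of writing block(nullptr) in every constructor, give the member
  // an in-class default (compare the AllocParams change below).
  int* block{nullptr};
};

// Read-only, potentially expensive-to-copy arguments go by const reference
// (compare the alloc_found_block change below).
std::size_t use(const Params& p) {
  return p.block == nullptr ? 0 : 1;
}

struct Pool {
  // A deliberately const data member: the check is suppressed for the next
  // line only rather than disabled globally (compare the BlockPool change).
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const bool is_small{false};
};

int main() {
  Pool pool;
  return static_cast<int>(use(Params{})) + static_cast<int>(pool.is_small);
}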

Pull Request resolved: https://github.com/pytorch/pytorch/pull/169231
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/albanD
Author: Yuanyuan Chen
Date: 2025-12-11 09:48:53 +00:00
Committed by: PyTorch MergeBot
Parent: b870068d3d
Commit: 28fe2d3dbd

5 changed files with 44 additions and 39 deletions

View File

@@ -34,6 +34,7 @@ struct BlockPool {
   std::set<Block*, Comparison> blocks;
   std::set<Block*, Comparison> unmapped;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const bool is_small;
   PrivatePool* owner_PrivatePool;
 };
@@ -63,7 +64,6 @@ struct Block {
       void* ptr)
       : device(device),
         queue(queue),
-        stream_uses(),
         size(size),
         requested_size(0),
         pool(pool),
@@ -71,11 +71,7 @@ struct Block {
   // constructor for search key
   Block(DeviceIndex device, sycl::queue* queue, size_t size)
-      : device(device),
-        queue(queue),
-        stream_uses(),
-        size(size),
-        requested_size(0) {}
+      : device(device), queue(queue), size(size), requested_size(0) {}
 
   bool is_split() const {
     return (prev != nullptr) || (next != nullptr);
@@ -142,7 +138,8 @@ struct ExpandableSegment {
     // The extra 1/8 allows flexibility for remapping or moving pages within the
     // segment when unmapping earlier regions.
     constexpr float kVirtualMemOversubscriptFactor = 1.125f; // 1 + 1/8
-    max_handles_ = numSegments(device_total * kVirtualMemOversubscriptFactor);
+    max_handles_ = numSegments(static_cast<size_t>(
+        static_cast<float>(device_total) * kVirtualMemOversubscriptFactor));
     ptr_ = sycl::ext::oneapi::experimental::reserve_virtual_mem(
         segment_size_ * max_handles_, xpu::get_device_context());
   }
@@ -168,15 +165,16 @@ struct ExpandableSegment {
     // Allocate and map physical memory for each segment.
     for (const auto i : c10::irange(begin, end)) {
       TORCH_INTERNAL_ASSERT(!handles_.at(i));
+      auto& handle = handles_.at(i);
       try {
         // Allocate physical memory for each segment. Construct the physical_mem
         // in-place to avoid copies.
-        handles_.at(i).emplace(
+        auto& mem = handle.emplace(
             xpu::get_raw_device(device_),
             xpu::get_device_context(),
             segment_size_);
         // Map the allocated physical memory into the virtual address space.
-        handles_.at(i).value().map(
+        mem.map(
             ptr_ + i * segment_size_,
             segment_size_,
             sycl::ext::oneapi::experimental::address_access_mode::read_write);
@@ -187,13 +185,14 @@ struct ExpandableSegment {
         // Note: constructing physical_mem may over-subscribe device memory but
         // not immediately trigger OOM. The actual OOM can occur during map().
         // Roll back all segments allocated or mapped in this operation.
-        handles_.at(i) = std::nullopt;
+        handle.reset();
         for (const auto j : c10::irange(begin, i)) {
           sycl::ext::oneapi::experimental::unmap(
+              // NOLINTNEXTLINE(performance-no-int-to-ptr)
              reinterpret_cast<void*>(ptr_ + segment_size_ * j),
              segment_size_,
              xpu::get_device_context());
-          handles_.at(j) = std::nullopt;
+          handles_.at(j).reset();
         }
         trimHandles();
         return rangeFromHandles(begin, begin);
@@ -245,6 +244,7 @@ struct ExpandableSegment {
      // ranges. Users must explicitly call unmap on all ranges before
      // destroying the physical_mem object.
      sycl::ext::oneapi::experimental::unmap(
+          // NOLINTNEXTLINE(performance-no-int-to-ptr)
          reinterpret_cast<void*>(ptr_ + segment_size_ * i),
          segment_size_,
          xpu::get_device_context());
@@ -318,9 +318,9 @@ struct ExpandableSegment {
   size_t max_handles_{0};
   // Physical memory handles for the segments.
   std::vector<std::optional<sycl::ext::oneapi::experimental::physical_mem>>
-      handles_{};
+      handles_;
   // Peer devices on which this memory could be accessible, reserved.
-  std::vector<c10::DeviceIndex> peers_{};
+  std::vector<c10::DeviceIndex> peers_;
 };
 
 struct AllocParams {
@@ -330,10 +330,7 @@ struct AllocParams {
       sycl::queue* queue,
       BlockPool* pool,
       size_t alloc_size)
-      : search_key(device, queue, size),
-        pool(pool),
-        alloc_size(alloc_size),
-        block(nullptr) {}
+      : search_key(device, queue, size), pool(pool), alloc_size(alloc_size) {}
 
   DeviceIndex device() const {
     return search_key.device;
@@ -350,7 +347,7 @@ struct AllocParams {
   Block search_key;
   BlockPool* pool;
   size_t alloc_size;
-  Block* block;
+  Block* block{nullptr};
   StatTypes stat_types = {};
 };
@@ -987,7 +984,7 @@
   }
 
   Block* alloc_found_block(
-      AllocParams params,
+      const AllocParams& params,
       size_t orig_size,
       bool split_remainder) {
     auto size = params.size();
@@ -1151,7 +1148,7 @@
         " Please use `empty_cache` to release all unoccupied cached memory.");
     }
     bool split_remainder = should_split(params.block, params.size());
-    return alloc_found_block(std::move(params), orig_size, split_remainder);
+    return alloc_found_block(params, orig_size, split_remainder);
   }
 
   void free(Block* block) {
@@ -1254,7 +1251,8 @@
     const auto device_total =
         xpu::get_raw_device(device_index)
             .get_info<sycl::info::device::global_mem_size>();
-    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
+    allowed_memory_maximum =
+        static_cast<size_t>(fraction * static_cast<double>(device_total));
     set_fraction = true;
   }
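The two static_cast changes in this file silence implicit-conversion warnings by spelling out each conversion in mixed float/integer arithmetic. A small self-contained sketch of the memory-fraction computation with assumed example values (16 GiB device, fraction 0.5; assumes a 64-bit size_t):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const std::uint64_t device_total = 16ULL * 1024 * 1024 * 1024; // 16 GiB
  const double fraction = 0.5;

  // fraction * device_total would implicitly convert the integer to double
  // and the product back to size_t; clang-tidy wants both conversions
  // written out so the possible precision change is visible at the call site.
  const auto allowed =
      static_cast<std::size_t>(fraction * static_cast<double>(device_total));

  std::printf("allowed bytes: %zu\n", allowed); // 8589934592 (8 GiB)
  return 0;
}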

View File

@@ -5,18 +5,19 @@
 
 namespace c10::xpu {
 
-static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) {
-  if (el.size() == 0) {
-    return;
-  }
-  for (const auto& e : el) {
-    try {
-      std::rethrow_exception(e);
-    } catch (sycl::exception& e) {
-      TORCH_WARN("SYCL Exception: ", e.what());
-    }
-  }
-  throw;
-};
+static inline sycl::async_handler asyncHandler =
+    [](const sycl::exception_list& el) {
+      if (el.size() == 0) {
+        return;
+      }
+      for (const auto& e : el) {
+        try {
+          std::rethrow_exception(e);
+        } catch (sycl::exception& e) {
+          TORCH_WARN("SYCL Exception: ", e.what());
+        }
+      }
+      throw;
+    };
 
 } // namespace c10::xpu
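The only functional change here is the lambda's signature: taking sycl::exception_list by value copies the list on every invocation, which performance-unnecessary-value-param flags for read-only parameters. A minimal sketch of the same fix, using std::vector<std::exception_ptr> as a stand-in for sycl::exception_list (likewise a container of std::exception_ptr) so the snippet builds without a SYCL toolchain:

#include <exception>
#include <iostream>
#include <stdexcept>
#include <vector>

// Stand-in for sycl::exception_list.
using exception_list = std::vector<std::exception_ptr>;

// Before: [](exception_list el) { ... }  -- copies the list on each call.
const auto handler = [](const exception_list& el) { // no copy
  for (const auto& e : el) {
    try {
      std::rethrow_exception(e);
    } catch (const std::exception& ex) {
      std::cerr << "async exception: " << ex.what() << '\n';
    }
  }
};

int main() {
  exception_list el;
  el.push_back(std::make_exception_ptr(std::runtime_error("boom")));
  handler(el);
  return 0;
}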

View File

@@ -5,7 +5,6 @@
 #include <atomic>
 #include <deque>
 #include <mutex>
-#include <vector>
 
 namespace c10::xpu {
@@ -30,6 +29,7 @@ std::deque<
     std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
     priority_counters;
 
+// NOLINTNEXTLINE(*c-arrays)
 thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
 
 /*
@@ -174,6 +174,7 @@ void initXPUStreamsOnce() {
   // Inits current streams (thread local) to the last queue in the "normal
   // priority" queue pool. Note: the queue pool have not been initialized yet.
   // It will be initialized in initDeviceStreamState for the specified device.
+  // NOLINTNEXTLINE(*c-arrays)
   current_streams = std::make_unique<StreamId[]>(num_gpus);
   for (const auto i : c10::irange(num_gpus)) {
     // Assigning the current stream to the last one in the pool can be
@@ -238,9 +239,11 @@ sycl::queue& XPUStream::queue() const {
   switch (st) {
     case StreamIdType::NORMAL:
     case StreamIdType::HIGH:
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       return *streams[device_index][static_cast<uint8_t>(st)][si];
     // See Note [External XPU Stream]
     case StreamIdType::EXT:
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       return *(reinterpret_cast<sycl::queue*>(stream_id));
     default:
       TORCH_CHECK(
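In the EXT case, an externally created sycl::queue* was packed into the integer StreamId (see Note [External XPU Stream]) and has to be recovered by casting back; clang-tidy's performance-no-int-to-ptr cannot see that the integer originated as a pointer, hence the targeted NOLINTs. A minimal sketch of such a round-trip (a simplified stand-in, not PyTorch's exact StreamId encoding):

#include <cassert>
#include <cstdint>

struct Queue {
  int id;
};
using StreamId = std::int64_t; // assumed wide enough to hold a pointer

StreamId pack(Queue* q) {
  return reinterpret_cast<StreamId>(q);
}

Queue* unpack(StreamId sid) {
  // The integer is known to have started life as a valid Queue*.
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  return reinterpret_cast<Queue*>(sid);
}

int main() {
  Queue q{42};
  assert(unpack(pack(&q)) == &q);
  return 0;
}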

View File

@@ -44,7 +44,7 @@ class C10_XPU_API XPUStream {
   }
 
   /// Construct a XPUStream from a Stream with no error checking.
-  explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {}
+  explicit XPUStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}
 
   bool operator==(const XPUStream& other) const noexcept {
     return unwrap() == other.unwrap();
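Unchecked is a tag parameter: it only selects this overload and is never read, so it gets a /*unused*/ comment instead of a name, which satisfies clang-tidy checks that want every parameter identified (readability-named-parameter suggests exactly this style). A minimal sketch of the tag-dispatch pattern with a hypothetical Stream class:

#include <iostream>
#include <stdexcept>

struct Unchecked {};

class Stream {
 public:
  explicit Stream(int id) : id_(id) {
    if (id < 0) {
      throw std::invalid_argument("bad id");
    }
  }
  // Tag-dispatched overload: the caller vouches for validity, and the tag
  // itself is never read, so it stays unnamed.
  Stream(Unchecked /*unused*/, int id) : id_(id) {}
  int id() const {
    return id_;
  }

 private:
  int id_;
};

int main() {
  Stream checked(3);
  Stream unchecked(Unchecked{}, -1); // skips the validity check
  std::cout << checked.id() + unchecked.id() << '\n';
  return 0;
}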

View File

@@ -20,7 +20,7 @@ TEST(XPUStreamTest, CopyAndMoveTest) {
     return;
   }
-  int32_t device = -1;
+  c10::DeviceIndex device = -1;
   sycl::queue queue;
   c10::xpu::XPUStream copyStream = c10::xpu::getStreamFromPool();
   {
@@ -119,8 +119,10 @@ TEST(XPUStreamTest, MultithreadStreamBehavior) {
   c10::xpu::XPUStream cur_stream = c10::xpu::getCurrentXPUStream();
-  EXPECT_NE(cur_stream, *s0);
-  EXPECT_NE(cur_stream, *s1);
+  EXPECT_TRUE(s0);
+  EXPECT_TRUE(s1);
+  EXPECT_NE(cur_stream, s0);
+  EXPECT_NE(cur_stream, s1);
   EXPECT_NE(s0, s1);
 }
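Here s0 and s1 appear to be std::optional-wrapped streams populated by worker threads; dereferencing them with *s0 and no prior engagement check is what bugprone-unchecked-optional-access flags. The fix asserts that both optionals are engaged and then compares without dereferencing, relying on std::optional's mixed value/optional comparison operators. A minimal sketch:

#include <cassert>
#include <optional>

int main() {
  std::optional<int> s0 = 1;
  std::optional<int> s1 = 2;
  const int cur = 3;

  assert(s0.has_value()); // check engagement before any access
  assert(s1.has_value());
  assert(cur != s0); // value vs. optional comparison, no dereference
  assert(cur != s1);
  assert(s0 != s1); // optional vs. optional
  return 0;
}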
@@ -167,6 +169,7 @@ TEST(XPUStreamTest, StreamFunction) {
   }
 
   constexpr int numel = 1024;
+  // NOLINTNEXTLINE(*-c-arrays)
   int hostData[numel];
   initHostData(hostData, numel);
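This last suppression keeps a deliberate C array in the test: hostData is a fixed-size stack buffer handed to helpers that take a raw int*, and modernize-avoid-c-arrays (matched by the *-c-arrays glob) would otherwise ask for std::array. A sketch of both choices (initHostData here is hypothetical, standing in for the test helper):

#include <array>

void initHostData(int* data, int n) {
  for (int i = 0; i < n; ++i) {
    data[i] = i;
  }
}

int main() {
  constexpr int numel = 1024;

  // As in the test: a plain C array, with the check suppressed.
  // NOLINTNEXTLINE(*-c-arrays)
  int hostData[numel];
  initHostData(hostData, numel);

  // The alternative the check would suggest:
  std::array<int, numel> hostData2{};
  initHostData(hostData2.data(), numel);

  return hostData[1] + hostData2[1];
}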