Fix clang-tidy warnings on c10/xpu files (#169231)
This PR fixes clang-tidy warnings on c10/xpu files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/169231
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/albanD
committed by PyTorch MergeBot
parent b870068d3d
commit 28fe2d3dbd
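Most of the changes below follow a handful of recurring clang-tidy patterns: dropping redundant member initializers, passing read-only parameters by const reference, making narrowing conversions explicit, and suppressing checks where the flagged code is intentional. As a minimal sketch of the suppression mechanism used throughout (the struct here is illustrative, not from the PR):

struct Config {
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const bool verbose; // the const member is deliberate, so silence only this check
  int level{0};
};

The comment applies to the next line only and names one specific check, so the rest of the translation unit stays fully linted.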
@@ -34,6 +34,7 @@ struct BlockPool {
   std::set<Block*, Comparison> blocks;
   std::set<Block*, Comparison> unmapped;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const bool is_small;
   PrivatePool* owner_PrivatePool;
 };
@@ -63,7 +64,6 @@ struct Block {
       void* ptr)
       : device(device),
         queue(queue),
-        stream_uses(),
         size(size),
         requested_size(0),
         pool(pool),
@@ -71,11 +71,7 @@ struct Block {
 
   // constructor for search key
   Block(DeviceIndex device, sycl::queue* queue, size_t size)
-      : device(device),
-        queue(queue),
-        stream_uses(),
-        size(size),
-        requested_size(0) {}
+      : device(device), queue(queue), size(size), requested_size(0) {}
 
   bool is_split() const {
     return (prev != nullptr) || (next != nullptr);
@@ -142,7 +138,8 @@ struct ExpandableSegment {
     // The extra 1/8 allows flexibility for remapping or moving pages within the
     // segment when unmapping earlier regions.
     constexpr float kVirtualMemOversubscriptFactor = 1.125f; // 1 + 1/8
-    max_handles_ = numSegments(device_total * kVirtualMemOversubscriptFactor);
+    max_handles_ = numSegments(static_cast<size_t>(
+        static_cast<float>(device_total) * kVirtualMemOversubscriptFactor));
     ptr_ = sycl::ext::oneapi::experimental::reserve_virtual_mem(
         segment_size_ * max_handles_, xpu::get_device_context());
   }
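The rewritten call makes both conversions explicit. A standalone sketch of why clang-tidy objects (the function and names are hypothetical):

#include <cstddef>

std::size_t scaled_budget(std::size_t total_bytes) {
  constexpr float kFactor = 1.125f; // 1 + 1/8
  // An implicit size_t -> float multiply followed by an implicit
  // float -> size_t conversion is flagged (e.g. by
  // cppcoreguidelines-narrowing-conversions); explicit casts document
  // that the precision loss is intended.
  return static_cast<std::size_t>(static_cast<float>(total_bytes) * kFactor);
}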
@@ -168,15 +165,16 @@ struct ExpandableSegment {
     // Allocate and map physical memory for each segment.
     for (const auto i : c10::irange(begin, end)) {
       TORCH_INTERNAL_ASSERT(!handles_.at(i));
+      auto& handle = handles_.at(i);
       try {
         // Allocate physical memory for each segment. Construct the physical_mem
         // in-place to avoid copies.
-        handles_.at(i).emplace(
+        auto& mem = handle.emplace(
             xpu::get_raw_device(device_),
             xpu::get_device_context(),
             segment_size_);
         // Map the allocated physical memory into the virtual address space.
-        handles_.at(i).value().map(
+        mem.map(
             ptr_ + i * segment_size_,
             segment_size_,
             sycl::ext::oneapi::experimental::address_access_mode::read_write);
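This refactor leans on std::optional::emplace returning a reference to the newly constructed value (since C++17), which removes the second lookup through value(). A minimal sketch:

#include <optional>
#include <string>

int main() {
  std::optional<std::string> slot;
  // emplace() constructs the value in place and returns std::string&,
  // so no follow-up slot.value() call is needed.
  auto& s = slot.emplace(16, 'x');
  s.push_back('!');
  return s.size() == 17 ? 0 : 1;
}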
@@ -187,13 +185,14 @@ struct ExpandableSegment {
       // Note: constructing physical_mem may over-subscribe device memory but
       // not immediately trigger OOM. The actual OOM can occur during map().
       // Roll back all segments allocated or mapped in this operation.
-      handles_.at(i) = std::nullopt;
+      handle.reset();
       for (const auto j : c10::irange(begin, i)) {
         sycl::ext::oneapi::experimental::unmap(
+            // NOLINTNEXTLINE(performance-no-int-to-ptr)
             reinterpret_cast<void*>(ptr_ + segment_size_ * j),
             segment_size_,
             xpu::get_device_context());
-        handles_.at(j) = std::nullopt;
+        handles_.at(j).reset();
       }
       trimHandles();
       return rangeFromHandles(begin, begin);
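reset() and assigning std::nullopt are equivalent ways to destroy the contained value; reset() states the intent more directly. Sketch:

#include <optional>

int main() {
  std::optional<int> handle = 42;
  handle.reset(); // same effect as: handle = std::nullopt;
  return handle.has_value() ? 1 : 0;
}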
@@ -245,6 +244,7 @@ struct ExpandableSegment {
     // ranges. Users must explicitly call unmap on all ranges before
     // destroying the physical_mem object.
     sycl::ext::oneapi::experimental::unmap(
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<void*>(ptr_ + segment_size_ * i),
         segment_size_,
         xpu::get_device_context());
@@ -318,9 +318,9 @@ struct ExpandableSegment {
   size_t max_handles_{0};
   // Physical memory handles for the segments.
   std::vector<std::optional<sycl::ext::oneapi::experimental::physical_mem>>
-      handles_{};
+      handles_;
   // Peer devices on which this memory could be accessible, reserved.
-  std::vector<c10::DeviceIndex> peers_{};
+  std::vector<c10::DeviceIndex> peers_;
 };
 
 struct AllocParams {
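Dropping the trailing {} is the readability-redundant-member-init fix: for a type like std::vector the braces merely invoke the default constructor that would run anyway. Sketch (Widget is hypothetical):

#include <vector>

struct Widget {
  std::vector<int> values_; // was "values_{}"; both default-construct
};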
@@ -330,10 +330,7 @@ struct AllocParams {
       sycl::queue* queue,
       BlockPool* pool,
       size_t alloc_size)
-      : search_key(device, queue, size),
-        pool(pool),
-        alloc_size(alloc_size),
-        block(nullptr) {}
+      : search_key(device, queue, size), pool(pool), alloc_size(alloc_size) {}
 
   DeviceIndex device() const {
     return search_key.device;
@@ -350,7 +347,7 @@ struct AllocParams {
   Block search_key;
   BlockPool* pool;
   size_t alloc_size;
-  Block* block;
+  Block* block{nullptr};
   StatTypes stat_types = {};
 };
 
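Moving the nullptr into a default member initializer guarantees the pointer starts null in every constructor, which is what let block(nullptr) disappear from the init list above. Sketch (Tracker is hypothetical):

struct Tracker {
  int* current{nullptr}; // every constructor starts with a null pointer
  Tracker() = default;
  explicit Tracker(int* p) : current(p) {}
};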
@@ -987,7 +984,7 @@ class DeviceCachingAllocator {
   }
 
   Block* alloc_found_block(
-      AllocParams params,
+      const AllocParams& params,
       size_t orig_size,
       bool split_remainder) {
     auto size = params.size();
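Taking AllocParams by const reference rather than by value avoids a copy on every call, the performance-unnecessary-value-param fix; it also makes the std::move at the call site (removed in the next hunk) unnecessary. A standalone sketch with a hypothetical type:

#include <cstddef>
#include <string>

struct Request {
  std::string payload;
};

// By-value would copy the string on each call even though the callee
// only reads it; const& keeps the signature read-only with no copy.
std::size_t process(const Request& req) {
  return req.payload.size();
}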
@@ -1151,7 +1148,7 @@ class DeviceCachingAllocator {
         " Please use `empty_cache` to release all unoccupied cached memory.");
     }
     bool split_remainder = should_split(params.block, params.size());
-    return alloc_found_block(std::move(params), orig_size, split_remainder);
+    return alloc_found_block(params, orig_size, split_remainder);
   }
 
   void free(Block* block) {
@@ -1254,7 +1251,8 @@ class DeviceCachingAllocator {
     const auto device_total =
         xpu::get_raw_device(device_index)
             .get_info<sycl::info::device::global_mem_size>();
-    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
+    allowed_memory_maximum =
+        static_cast<size_t>(fraction * static_cast<double>(device_total));
     set_fraction = true;
   }
 
@@ -5,18 +5,19 @@
 
 namespace c10::xpu {
 
-static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) {
-  if (el.size() == 0) {
-    return;
-  }
-  for (const auto& e : el) {
-    try {
-      std::rethrow_exception(e);
-    } catch (sycl::exception& e) {
-      TORCH_WARN("SYCL Exception: ", e.what());
-    }
-  }
-  throw;
-};
+static inline sycl::async_handler asyncHandler =
+    [](const sycl::exception_list& el) {
+      if (el.size() == 0) {
+        return;
+      }
+      for (const auto& e : el) {
+        try {
+          std::rethrow_exception(e);
+        } catch (sycl::exception& e) {
+          TORCH_WARN("SYCL Exception: ", e.what());
+        }
+      }
+      throw;
+    };
 
 } // namespace c10::xpu
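For context, an async_handler is the callback a SYCL queue invokes for exceptions raised asynchronously; taking the exception_list by const reference is the same performance-unnecessary-value-param fix as above. A minimal usage sketch outside PyTorch, assuming a SYCL 2020 toolchain:

#include <sycl/sycl.hpp>

int main() {
  auto handler = [](const sycl::exception_list& el) {
    for (const auto& e : el) {
      try {
        std::rethrow_exception(e);
      } catch (const sycl::exception& ex) {
        // log ex.what() and decide whether to continue
      }
    }
  };
  sycl::queue q{sycl::default_selector_v, handler};
  q.wait_and_throw(); // delivers any pending async exceptions to handler
  return 0;
}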
@@ -5,7 +5,6 @@
 
 #include <atomic>
 #include <deque>
-#include <mutex>
 #include <vector>
 
 namespace c10::xpu {
@@ -30,6 +29,7 @@ std::deque<
     std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
     priority_counters;
 
+// NOLINTNEXTLINE(*c-arrays)
 thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
 
 /*
@@ -174,6 +174,7 @@ void initXPUStreamsOnce() {
   // Inits current streams (thread local) to the last queue in the "normal
   // priority" queue pool. Note: the queue pool have not been initialized yet.
   // It will be initialized in initDeviceStreamState for the specified device.
+  // NOLINTNEXTLINE(*c-arrays)
   current_streams = std::make_unique<StreamId[]>(num_gpus);
   for (const auto i : c10::irange(num_gpus)) {
     // Assigning the current stream to the last one in the pool can be
@@ -238,9 +239,11 @@ sycl::queue& XPUStream::queue() const {
   switch (st) {
     case StreamIdType::NORMAL:
    case StreamIdType::HIGH:
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       return *streams[device_index][static_cast<uint8_t>(st)][si];
     // See Note [External XPU Stream]
     case StreamIdType::EXT:
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       return *(reinterpret_cast<sycl::queue*>(stream_id));
     default:
       TORCH_CHECK(
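performance-no-int-to-ptr flags any integer-to-pointer reinterpret_cast because such casts defeat alias analysis; when a stream id genuinely encodes a raw pointer, as for external streams here, the cast is intentional and gets a line-scoped suppression. Sketch:

#include <cstdint>

void* decode(std::intptr_t id) {
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  return reinterpret_cast<void*>(id); // id is known to hold a valid pointer
}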
@@ -44,7 +44,7 @@ class C10_XPU_API XPUStream {
   }
 
   /// Construct a XPUStream from a Stream with no error checking.
-  explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {}
+  explicit XPUStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}
 
   bool operator==(const XPUStream& other) const noexcept {
     return unwrap() == other.unwrap();
@@ -20,7 +20,7 @@ TEST(XPUStreamTest, CopyAndMoveTest) {
     return;
   }
 
-  int32_t device = -1;
+  c10::DeviceIndex device = -1;
   sycl::queue queue;
   c10::xpu::XPUStream copyStream = c10::xpu::getStreamFromPool();
   {
@@ -119,8 +119,10 @@ TEST(XPUStreamTest, MultithreadStreamBehavior) {
 
   c10::xpu::XPUStream cur_stream = c10::xpu::getCurrentXPUStream();
 
-  EXPECT_NE(cur_stream, *s0);
-  EXPECT_NE(cur_stream, *s1);
+  EXPECT_TRUE(s0);
+  EXPECT_TRUE(s1);
+  EXPECT_NE(cur_stream, s0);
+  EXPECT_NE(cur_stream, s1);
   EXPECT_NE(s0, s1);
 }
 
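The updated assertions first verify the handles hold values, then compare without dereferencing. Assuming s0 and s1 are std::optional-like (their actual type is not shown in this diff), the pattern looks like:

#include <gtest/gtest.h>
#include <optional>

TEST(OptionalSketch, CompareWithoutDeref) {
  std::optional<int> a = 1;
  std::optional<int> b = 2;
  EXPECT_TRUE(a); // has_value() check via explicit operator bool
  EXPECT_TRUE(b);
  EXPECT_NE(a, b); // std::optional compares the contained values
}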
@@ -167,6 +169,7 @@ TEST(XPUStreamTest, StreamFunction) {
   }
 
   constexpr int numel = 1024;
+  // NOLINTNEXTLINE(*-c-arrays)
   int hostData[numel];
   initHostData(hostData, numel);
 