diff --git a/c10/xpu/XPUCachingAllocator.cpp b/c10/xpu/XPUCachingAllocator.cpp
index 92dffc91539..8bbb6836f8f 100644
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@@ -34,6 +34,7 @@
 struct BlockPool {
   std::set<Block*, Comparison> blocks;
   std::set<Block*, Comparison> unmapped;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const bool is_small;
   PrivatePool* owner_PrivatePool;
 };
@@ -63,7 +64,6 @@ struct Block {
       void* ptr)
       : device(device),
         queue(queue),
-        stream_uses(),
         size(size),
         requested_size(0),
         pool(pool),
@@ -71,11 +71,7 @@ struct Block {
   // constructor for search key
   Block(DeviceIndex device, sycl::queue* queue, size_t size)
-      : device(device),
-        queue(queue),
-        stream_uses(),
-        size(size),
-        requested_size(0) {}
+      : device(device), queue(queue), size(size), requested_size(0) {}

   bool is_split() const {
     return (prev != nullptr) || (next != nullptr);
   }
@@ -142,7 +138,8 @@ struct ExpandableSegment {
     // The extra 1/8 allows flexibility for remapping or moving pages within the
     // segment when unmapping earlier regions.
     constexpr float kVirtualMemOversubscriptFactor = 1.125f; // 1 + 1/8
-    max_handles_ = numSegments(device_total * kVirtualMemOversubscriptFactor);
+    max_handles_ = numSegments(static_cast<size_t>(
+        static_cast<double>(device_total) * kVirtualMemOversubscriptFactor));
     ptr_ = sycl::ext::oneapi::experimental::reserve_virtual_mem(
         segment_size_ * max_handles_, xpu::get_device_context());
   }
@@ -168,15 +165,16 @@ struct ExpandableSegment {
     // Allocate and map physical memory for each segment.
     for (const auto i : c10::irange(begin, end)) {
       TORCH_INTERNAL_ASSERT(!handles_.at(i));
+      auto& handle = handles_.at(i);
       try {
         // Allocate physical memory for each segment. Construct the physical_mem
         // in-place to avoid copies.
-        handles_.at(i).emplace(
+        auto& mem = handle.emplace(
             xpu::get_raw_device(device_),
             xpu::get_device_context(),
             segment_size_);
         // Map the allocated physical memory into the virtual address space.
-        handles_.at(i).value().map(
+        mem.map(
             ptr_ + i * segment_size_,
             segment_size_,
             sycl::ext::oneapi::experimental::address_access_mode::read_write);
@@ -187,13 +185,14 @@ struct ExpandableSegment {
       // Note: constructing physical_mem may over-subscribe device memory but
       // not immediately trigger OOM. The actual OOM can occur during map().
       // Roll back all segments allocated or mapped in this operation.
-      handles_.at(i) = std::nullopt;
+      handle.reset();
       for (const auto j : c10::irange(begin, i)) {
         sycl::ext::oneapi::experimental::unmap(
+            // NOLINTNEXTLINE(performance-no-int-to-ptr)
             reinterpret_cast<void*>(ptr_ + segment_size_ * j),
             segment_size_,
             xpu::get_device_context());
-        handles_.at(j) = std::nullopt;
+        handles_.at(j).reset();
       }
       trimHandles();
       return rangeFromHandles(begin, begin);
@@ -245,6 +244,7 @@ struct ExpandableSegment {
       // ranges. Users must explicitly call unmap on all ranges before
       // destroying the physical_mem object.
       sycl::ext::oneapi::experimental::unmap(
+          // NOLINTNEXTLINE(performance-no-int-to-ptr)
           reinterpret_cast<void*>(ptr_ + segment_size_ * i),
           segment_size_,
           xpu::get_device_context());
@@ -318,9 +318,9 @@ struct ExpandableSegment {
   size_t max_handles_{0};
   // Physical memory handles for the segments.
   std::vector<std::optional<sycl::ext::oneapi::experimental::physical_mem>>
-      handles_{};
+      handles_;
   // Peer devices on which this memory could be accessible, reserved.
-  std::vector<c10::DeviceIndex> peers_{};
+  std::vector<c10::DeviceIndex> peers_;
 };

 struct AllocParams {
@@ -330,10 +330,7 @@
       sycl::queue* queue,
       BlockPool* pool,
       size_t alloc_size)
-      : search_key(device, queue, size),
-        pool(pool),
-        alloc_size(alloc_size),
-        block(nullptr) {}
+      : search_key(device, queue, size), pool(pool), alloc_size(alloc_size) {}

   DeviceIndex device() const {
     return search_key.device;
   }
@@ -350,7 +347,7 @@
   Block search_key;
   BlockPool* pool;
   size_t alloc_size;
-  Block* block;
+  Block* block{nullptr};
   StatTypes stat_types = {};
 };

@@ -987,7 +984,7 @@ class DeviceCachingAllocator {
   }

   Block* alloc_found_block(
-      AllocParams params,
+      const AllocParams& params,
       size_t orig_size,
       bool split_remainder) {
     auto size = params.size();
@@ -1151,7 +1148,7 @@ class DeviceCachingAllocator {
           " Please use `empty_cache` to release all unoccupied cached memory.");
     }
     bool split_remainder = should_split(params.block, params.size());
-    return alloc_found_block(std::move(params), orig_size, split_remainder);
+    return alloc_found_block(params, orig_size, split_remainder);
   }

   void free(Block* block) {
@@ -1254,7 +1251,8 @@
     const auto device_total =
         xpu::get_raw_device(device_index)
             .get_info<sycl::info::device::global_mem_size>();
-    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
+    allowed_memory_maximum =
+        static_cast<size_t>(fraction * static_cast<double>(device_total));
     set_fraction = true;
   }
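Note (illustrative, not part of the patch): the emplace/reset changes above lean on two std::optional guarantees: since C++17, optional::emplace returns a reference to the value it just constructed, and reset() destroys the contained value in place. Below is a minimal standalone sketch of the same acquire-or-roll-back loop, with a hypothetical Resource type standing in for sycl physical_mem.

#include <cstddef>
#include <optional>
#include <stdexcept>
#include <vector>

struct Resource {
  explicit Resource(std::size_t size) {
    if (size == 0) {
      throw std::runtime_error("allocation failed");
    }
  }
};

// Construct Resources into slots [begin, end); on any failure, clear every
// slot touched by this call and report failure, leaving earlier slots intact.
bool acquireRange(
    std::vector<std::optional<Resource>>& slots,
    std::size_t begin,
    std::size_t end,
    std::size_t size) {
  for (std::size_t i = begin; i < end; ++i) {
    auto& slot = slots.at(i);
    try {
      // C++17: emplace returns a reference, so the Resource is constructed
      // once, in place (mirroring the "in-place to avoid copies" comment in
      // the patch).
      Resource& r = slot.emplace(size);
      (void)r; // the patch calls map() here, which may also throw
    } catch (const std::exception&) {
      slot.reset(); // clear the slot whose construction or mapping failed
      for (std::size_t j = begin; j < i; ++j) {
        slots.at(j).reset(); // roll back everything acquired in this call
      }
      return false;
    }
  }
  return true;
}

Caching handles_.at(i) in a local reference (handle, mem) also drops the repeated bounds-checked at() lookups and the value() call, which is what lets the patch replace assignment from std::nullopt with the more direct reset().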
diff --git a/c10/xpu/XPUException.h b/c10/xpu/XPUException.h
index 9bc64ec3f39..4a71b52aa77 100644
--- a/c10/xpu/XPUException.h
+++ b/c10/xpu/XPUException.h
@@ -5,18 +5,19 @@

 namespace c10::xpu {

-static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) {
-  if (el.size() == 0) {
-    return;
-  }
-  for (const auto& e : el) {
-    try {
-      std::rethrow_exception(e);
-    } catch (sycl::exception& e) {
-      TORCH_WARN("SYCL Exception: ", e.what());
-    }
-  }
-  throw;
-};
+static inline sycl::async_handler asyncHandler =
+    [](const sycl::exception_list& el) {
+      if (el.size() == 0) {
+        return;
+      }
+      for (const auto& e : el) {
+        try {
+          std::rethrow_exception(e);
+        } catch (sycl::exception& e) {
+          TORCH_WARN("SYCL Exception: ", e.what());
+        }
+      }
+      throw;
+    };

 } // namespace c10::xpu
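Note (illustrative, not part of the patch): binding the lambda parameter as const sycl::exception_list& avoids copying the list each time the handler fires; that, plus re-indentation, is all the hunk above changes. A self-contained sketch of wiring a similar handler into a plain SYCL 2020 queue, with std::cerr standing in for TORCH_WARN (the sketch omits the final throw; that the original uses to escalate after warning):

#include <exception>
#include <iostream>
#include <sycl/sycl.hpp>

int main() {
  // Asynchronous errors are delivered as a batch of exception_ptrs; take the
  // list by const reference so each invocation does not copy it.
  auto handler = [](const sycl::exception_list& el) {
    for (const auto& e : el) {
      try {
        std::rethrow_exception(e);
      } catch (const sycl::exception& ex) {
        std::cerr << "SYCL Exception: " << ex.what() << '\n';
      }
    }
  };
  sycl::queue q{sycl::default_selector_v, handler};
  // Asynchronous exceptions surface at synchronization points:
  q.wait_and_throw();
}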
diff --git a/c10/xpu/XPUStream.cpp b/c10/xpu/XPUStream.cpp
index baf44ff11cb..894de11ca2c 100644
--- a/c10/xpu/XPUStream.cpp
+++ b/c10/xpu/XPUStream.cpp
@@ -5,7 +5,6 @@
 #include
 #include
-#include
 #include

 namespace c10::xpu {

@@ -30,6 +29,7 @@
 std::deque<
     std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
     priority_counters;

+// NOLINTNEXTLINE(*c-arrays)
 thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;

 /*
@@ -174,6 +174,7 @@ void initXPUStreamsOnce() {
   // Inits current streams (thread local) to the last queue in the "normal
   // priority" queue pool. Note: the queue pool have not been initialized yet.
   // It will be initialized in initDeviceStreamState for the specified device.
+  // NOLINTNEXTLINE(*c-arrays)
   current_streams = std::make_unique<StreamId[]>(num_gpus);
   for (const auto i : c10::irange(num_gpus)) {
     // Assigning the current stream to the last one in the pool can be
@@ -238,9 +239,11 @@ sycl::queue& XPUStream::queue() const {
     switch (st) {
       case StreamIdType::NORMAL:
       case StreamIdType::HIGH:
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         return *streams[device_index][static_cast<uint8_t>(st)][si];
       // See Note [External XPU Stream]
       case StreamIdType::EXT:
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         return *(reinterpret_cast<sycl::queue*>(stream_id));
       default:
         TORCH_CHECK(
diff --git a/c10/xpu/XPUStream.h b/c10/xpu/XPUStream.h
index fea64d7c109..c922759c2c4 100644
--- a/c10/xpu/XPUStream.h
+++ b/c10/xpu/XPUStream.h
@@ -44,7 +44,7 @@ class C10_XPU_API XPUStream {
   }

   /// Construct a XPUStream from a Stream with no error checking.
-  explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {}
+  explicit XPUStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}

   bool operator==(const XPUStream& other) const noexcept {
     return unwrap() == other.unwrap();
diff --git a/c10/xpu/test/impl/XPUStreamTest.cpp b/c10/xpu/test/impl/XPUStreamTest.cpp
index 661022dbe18..5d71ae4c0b6 100644
--- a/c10/xpu/test/impl/XPUStreamTest.cpp
+++ b/c10/xpu/test/impl/XPUStreamTest.cpp
@@ -20,7 +20,7 @@ TEST(XPUStreamTest, CopyAndMoveTest) {
     return;
   }

-  int32_t device = -1;
+  c10::DeviceIndex device = -1;
   sycl::queue queue;
   c10::xpu::XPUStream copyStream = c10::xpu::getStreamFromPool();
   {
@@ -119,8 +119,10 @@ TEST(XPUStreamTest, MultithreadStreamBehavior) {

   c10::xpu::XPUStream cur_stream = c10::xpu::getCurrentXPUStream();

-  EXPECT_NE(cur_stream, *s0);
-  EXPECT_NE(cur_stream, *s1);
+  EXPECT_TRUE(s0);
+  EXPECT_TRUE(s1);
+  EXPECT_NE(cur_stream, s0);
+  EXPECT_NE(cur_stream, s1);
   EXPECT_NE(s0, s1);
 }

@@ -167,6 +169,7 @@ TEST(XPUStreamTest, StreamFunction) {
   }

   constexpr int numel = 1024;
+  // NOLINTNEXTLINE(*-c-arrays)
   int hostData[numel];
   initHostData(hostData, numel);
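Note (illustrative, not part of the patch): the MultithreadStreamBehavior hunk reads as though s0 and s1 are std::optional<XPUStream>; their declarations sit outside the hunk, so this is an inference. Under that assumption, EXPECT_TRUE(s0) checks that the optional is engaged, and EXPECT_NE(cur_stream, s0) compiles because std::optional provides mixed comparisons against its value type. The same mechanics, sketched with int for brevity:

#include <cassert>
#include <optional>

int main() {
  std::optional<int> s0; // empty, like a stream slot a worker never assigned
  assert(!(s0 == 3)); // an empty optional compares unequal to any value
  s0 = 3;
  assert(s0 == 3); // engaged: compares by contained value
  assert(s0 != 4);
}

Checking EXPECT_TRUE(s0) before EXPECT_NE matters because an empty optional compares unequal to every value, so the inequality checks alone would pass vacuously if the worker thread never set the slot.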