diff --git a/.lintrunner.toml b/.lintrunner.toml
index 1725d427639..a75bd930aea 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -226,7 +226,7 @@ command = [
 [[linter]]
 code = 'CLANGTIDY'
 include_patterns = [
-    'c10/core/**/*.cpp',
+    'c10/**/*.cpp',
     'torch/csrc/fx/**/*.cpp',
     'torch/csrc/generic/**/*.cpp',
     'torch/csrc/onnx/**/*.cpp',
@@ -239,6 +239,7 @@ exclude_patterns = [
     # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed
     # in a follow up PR.
     # that are not easily converted to accepted c++
+    'c10/cuda/**/*.cpp',
    'c10/test/**/*.cpp',
    'torch/csrc/jit/passes/onnx/helper.cpp',
    'torch/csrc/jit/passes/onnx/shape_type_inference.cpp',
diff --git a/c10/benchmark/intrusive_ptr_benchmark.cpp b/c10/benchmark/intrusive_ptr_benchmark.cpp
index 7d22f0870a1..d0bdb91a250 100644
--- a/c10/benchmark/intrusive_ptr_benchmark.cpp
+++ b/c10/benchmark/intrusive_ptr_benchmark.cpp
@@ -7,7 +7,6 @@
 using c10::intrusive_ptr;
 using c10::intrusive_ptr_target;
 using c10::make_intrusive;
-using c10::weak_intrusive_ptr;

 namespace {

diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 261708a2df3..4db02bf0c2b 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -213,7 +213,7 @@ is_non_overlapping_and_dense
  * backend.
  **/
 struct C10_API BackendMeta : intrusive_ptr_target {
-  virtual ~BackendMeta(){};
+  ~BackendMeta() override = default;
   virtual intrusive_ptr<BackendMeta> clone(
       const intrusive_ptr<BackendMeta>& ptr) const {
     return ptr;
@@ -263,7 +263,7 @@ struct C10_API ExtraMeta {
       c10::optional<std::string> custom_data_ptr_error_msg_ = c10::nullopt)
       : symbolic_shape_meta_(std::move(symbolic_shape_meta)),
         named_tensor_meta_(std::move(named_tensor_meta)),
-        backend_meta_(backend_meta) {}
+        backend_meta_(std::move(backend_meta)) {}

   std::unique_ptr<ExtraMeta> clone() const {
     return std::make_unique<ExtraMeta>(*this);
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index f036e0c7fad..5ce29cdeb7c 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -362,14 +363,14 @@ struct ExpandableSegment {
       int device,
       cudaStream_t stream,
       size_t size,
-      const std::vector<int>& peers)
+      std::vector<int> peers)
       : device_(device),
         stream_(stream),
         max_handles_(0),
         // 2MB for small pool, 20MB for large pool
         segment_size_(size),
-        peers_(peers) {
-    cudaDeviceProp prop;
+        peers_(std::move(peers)) {
+    cudaDeviceProp prop{};
     C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_));
     // we allocate enough address space for 1 1/8 the total memory on the GPU.
     // This allows for some cases where we have to unmap pages earlier in the
@@ -390,11 +391,11 @@ struct ExpandableSegment {
       return rangeFromHandles(begin, end);
     }
     while (end > handles_.size()) {
-      handles_.push_back(c10::nullopt);
+      handles_.emplace_back(c10::nullopt);
     }
     for (auto i : c10::irange(begin, end)) {
       TORCH_INTERNAL_ASSERT(!handles_.at(i));
-      CUmemGenericAllocationHandle handle;
+      CUmemGenericAllocationHandle handle = 0;
       CUmemAllocationProp prop = {};
       prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
       prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
@@ -523,7 +524,7 @@
   }
   int device_;
   cudaStream_t stream_;
-  CUdeviceptr ptr_;
+  CUdeviceptr ptr_{};
   size_t max_handles_;
   size_t segment_size_;
   std::vector<c10::optional<CUmemGenericAllocationHandle>> handles_;
@@ -561,7 +562,7 @@
 // [Checkpointing PrivatePoolState]
 struct BlockState {
   int device = 0;
-  cudaStream_t stream = 0;
+  cudaStream_t stream = nullptr;
   stream_set stream_uses = {};
   size_t size = 0;
   void* ptr = nullptr;
@@ -715,8 +716,8 @@ struct PrivatePool {
   PrivatePool()
       : use_count(1),
         cudaMalloc_count(0),
-        large_blocks(/*is_small=*/false, this),
-        small_blocks(/*is_small=*/true, this) {}
+        large_blocks(/*small=*/false, this),
+        small_blocks(/*small=*/true, this) {}
   PrivatePool(const PrivatePool&) = delete;
   PrivatePool(PrivatePool&&) = delete;
   PrivatePool& operator=(const PrivatePool&) = delete;
@@ -759,7 +760,7 @@ SegmentState::SegmentState(Block* head) {
 PrivatePoolState::PrivatePoolState(
     MempoolId_t pool_id,
     const std::vector<Block*>& private_pool_head_blocks)
-    : owner_id(pool_id) {
+    : owner_id(std::move(pool_id)) {
   for (Block* head : private_pool_head_blocks) {
     segments.emplace_back(head);
   }
@@ -891,7 +892,7 @@ void CachingAllocatorConfig::lexArgs(
   size_t env_length = strlen(env);
   for (size_t i = 0; i < env_length; i++) {
     if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') {
-      if (buf.size() != 0) {
+      if (!buf.empty()) {
         config.emplace_back(buf.begin(), buf.end());
         buf.clear();
       }
@@ -964,7 +965,7 @@ size_t CachingAllocatorConfig::parseRoundUpPower2Divisions(
     if (config[i].compare("[") == 0) {
       size_t last_index = 0;
       while (++i < config.size() && config[i].compare("]") != 0) {
-        std::string val1 = config[i];
+        const std::string& val1 = config[i];
         size_t val2 = 0;

         consumeToken(config, ++i, ':');
@@ -1048,7 +1049,7 @@ size_t CachingAllocatorConfig::parseAllocatorConfig(
       used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
       if (used_cudaMallocAsync) {
 #if CUDA_VERSION >= 11040
-        int version;
+        int version = 0;
         C10_CUDA_CHECK(cudaDriverGetVersion(&version));
         TORCH_CHECK(
             version >= 11040,
@@ -1131,7 +1132,7 @@ static std::string reportProcessMemoryInfo(int device) {
     TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_());
   });

-  cudaDeviceProp prop;
+  cudaDeviceProp prop{};
   C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

   char pci_id[80];
@@ -1143,7 +1144,7 @@ static std::string reportProcessMemoryInfo(int device) {
       prop.pciBusID,
       prop.pciDeviceID);

-  nvmlDevice_t nvml_device;
+  nvmlDevice_t nvml_device = nullptr;
   TORCH_INTERNAL_ASSERT(
       NVML_SUCCESS ==
       DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_(
@@ -1250,8 +1251,8 @@ class DeviceCachingAllocator {
  public:
   DeviceCachingAllocator()
-      : large_blocks(/*is_small=*/false),
-        small_blocks(/*is_small=*/true),
+      : large_blocks(/*small=*/false),
+        small_blocks(/*small=*/true),
         alloc_trace(new std::vector<TraceEntry>()) {
     stats.max_split_size = CachingAllocatorConfig::max_split_size();
     context_recorder_.store(nullptr);
@@ -1280,7 +1281,7 @@ class DeviceCachingAllocator {
       const std::unordered_set<void*>& expected_live_allocations) {
     std::unique_lock<std::recursive_mutex> lock(mutex);

-    PrivatePool* pool;
+    PrivatePool* pool = nullptr;
     auto pool_it = graph_pools.find(mempool_id);
     TORCH_CHECK(pool_it != graph_pools.end(), "Could not find pool of id");
     pool = pool_it->second.get();
@@ -1370,8 +1371,8 @@ class DeviceCachingAllocator {
       // alloc_block should have thrown an exception already.
       TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation);

-      size_t device_free;
-      size_t device_total;
+      size_t device_free = 0;
+      size_t device_total = 0;
       C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));

       std::string allowed_info;
@@ -1660,8 +1661,8 @@ class DeviceCachingAllocator {
   /** set memory fraction to limit maximum allocated memory **/
   void setMemoryFraction(double fraction) {
-    size_t device_free;
-    size_t device_total;
+    size_t device_free = 0;
+    size_t device_total = 0;
     C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
     allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
     set_fraction = true;
@@ -1678,7 +1679,7 @@ class DeviceCachingAllocator {
     std::lock_guard<std::recursive_mutex> lock(mutex);
     if (*largest == 0) {
       // make an initial guess if a zero *largest is passed in
-      size_t tmp_bytes;
+      size_t tmp_bytes = 0;
       C10_CUDA_CHECK(cudaMemGetInfo(
           largest, // Use free memory as an optimistic initial guess of *largest
           &tmp_bytes));
@@ -2038,7 +2039,7 @@ class DeviceCachingAllocator {
         });

     if (record_history) {
-      record_trace(TraceEntry::SNAPSHOT, 0, total_active, 0, nullptr);
+      record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, nullptr);
     }
     return result;
   }
@@ -2665,7 +2666,7 @@ class DeviceCachingAllocator {
     C10_CUDA_CHECK(cudaGetLastError());

     size_t size = p.alloc_size;
-    void* ptr;
+    void* ptr = nullptr;

     if (isRetry) {
       stats.num_alloc_retries += 1;
@@ -2977,7 +2978,7 @@ class DeviceCachingAllocator {
   }

   void insert_events(Block* block) {
-    int prev_device;
+    int prev_device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&prev_device));

     stream_set streams(std::move(block->stream_uses));
@@ -2997,7 +2998,7 @@ class DeviceCachingAllocator {
   }

   void insert_events_deferred_until_no_capture() {
-    if (C10_UNLIKELY(needs_events_deferred_until_no_capture.size() > 0)) {
+    if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) {
       for (auto* block : needs_events_deferred_until_no_capture) {
         TORCH_INTERNAL_ASSERT(!block->stream_uses.empty());
         insert_events(block);
@@ -3140,7 +3141,7 @@ class NativeCachingAllocator : public CUDAAllocator {
   }

   bool initialized() override {
-    return device_allocator.size() > 0;
+    return !device_allocator.empty();
   }

   /** allocates a block which is safe to use from the provided stream */
@@ -3196,17 +3197,17 @@ class NativeCachingAllocator : public CUDAAllocator {
       CreateContextFn context_recorder,
       size_t alloc_trace_max_entries,
       bool alloc_trace_record_context) override {
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     device_allocator[device]->recordHistory(
         enabled,
-        std::move(context_recorder),
+        context_recorder,
         alloc_trace_max_entries,
         alloc_trace_record_context);
   }

   bool isHistoryEnabled() override {
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     return device_allocator[device]->isHistoryEnabled();
   }
@@ -3220,7 +3221,7 @@ class NativeCachingAllocator : public CUDAAllocator {
   }

   void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override {
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     device_allocator[device]->attachOutOfMemoryObserver(std::move(observer));
   }
@@ -3319,7 +3320,7 @@ class NativeCachingAllocator : public CUDAAllocator {
         OutOfMemoryError,
         size < one_exa_bytes,
         "CUDA out of memory. Tried to allocate more than 1EB memory.");
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     void* r = nullptr;
     if (forceUncachedAllocator()) {
@@ -3396,7 +3397,7 @@ class NativeCachingAllocator : public CUDAAllocator {
     if (nbytes == 0) {
       return nullptr;
     }
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     void* r = nullptr;
     malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
@@ -3407,7 +3408,7 @@ class NativeCachingAllocator : public CUDAAllocator {
     if (nbytes == 0) {
       return nullptr;
     }
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     void* r = nullptr;
     malloc(&r, device, nbytes, stream);
@@ -3484,7 +3485,7 @@ class NativeCachingAllocator : public CUDAAllocator {
     C10_CUDA_CHECK(cudaIpcOpenMemHandle(
         &dev, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
     // devPtr has to be deleted in same device when created.
-    int curr_device;
+    int curr_device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&curr_device));
     auto sp =
         std::shared_ptr<void>(dev, [handle, curr_device, this](void* ptr) {
@@ -3590,7 +3591,7 @@ struct BackendStaticInitializer {
   }
 };

-std::atomic<CUDAAllocator*> allocator{};
+std::atomic<CUDAAllocator*> allocator;
 BackendStaticInitializer backend_static_initializer;

 } // namespace CUDACachingAllocator
diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp
index abbd9aa2ab3..26afdbb0d72 100644
--- a/c10/cuda/CUDAFunctions.cpp
+++ b/c10/cuda/CUDAFunctions.cpp
@@ -14,7 +14,7 @@ int32_t driver_version() {
 }

 int device_count_impl(bool fail_if_no_driver) {
-  int count;
+  int count = 0;
   auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDeviceCount(&count));
   if (err == cudaSuccess) {
     return count;
@@ -121,7 +121,7 @@ DeviceIndex device_count_ensure_non_zero() {
 }

 DeviceIndex current_device() {
-  int cur_device;
+  int cur_device = 0;
   C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
   return static_cast<DeviceIndex>(cur_device);
 }
diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp
index e93a04d5c80..ebaddb4e8f0 100644
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@@ -58,7 +58,7 @@ struct PtrUsage {
   // recorded_streams holds side usage streams added by record_stream calls.
   // In other words, it does NOT include the original creation stream.
   ska::flat_hash_set<UsageStream, UsageStreamHash> recorded_streams;
-  UsageStream creation_stream;
+  UsageStream creation_stream{};
   uint64_t size;
   bool captured;
   PtrUsage(uint64_t s, bool c) : size(s), captured(c) {}
@@ -152,7 +152,7 @@ inline void lazy_init_device(int device) {
     // See "Retaining memory in the pool" here:
     // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/
-    cudaMemPool_t mempool;
+    cudaMemPool_t mempool = nullptr;
     C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
     uint64_t threshold = UINT64_MAX;
     C10_CUDA_CHECK(cudaMemPoolSetAttribute(
@@ -183,7 +183,7 @@ inline void lazy_init_device(int device) {

 inline void sync_raw(cudaStream_t dependency, cudaStream_t dependent) {
   // CUDACachingAllocator.cpp uses raw cuda events, as do we.
-  cudaEvent_t event;
+  cudaEvent_t event = nullptr;
   C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
   C10_CUDA_CHECK(cudaEventRecord(event, dependency));
   C10_CUDA_CHECK(cudaStreamWaitEvent(dependent, event));
@@ -331,7 +331,7 @@ void mallocAsync(void** devPtr, int device, size_t size, cudaStream_t stream) {
   std::lock_guard<std::mutex> lk(general_mutex);

   if (!capture_underway &&
-      ungraphed_ptrs_defer_free_until_no_capture.size() > 0) {
+      !ungraphed_ptrs_defer_free_until_no_capture.empty()) {
     // See Note [Avoid freeing uncaptured ptrs during CUDA graph capture]
     for (const auto ptr : ungraphed_ptrs_defer_free_until_no_capture) {
       auto it = ptr_info.find(ptr);
@@ -363,8 +363,8 @@ void mallocAsync(void** devPtr, int device, size_t size, cudaStream_t stream) {
     // allocation. This aligns with the behavior of alloc_block in
     // CUDACachingAllocator.cpp.
     (void)cudaGetLastError(); // clear CUDA error
-    size_t device_free;
-    size_t device_total;
+    size_t device_free = 0;
+    size_t device_total = 0;
     C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
     TORCH_CHECK_WITH(
         OutOfMemoryError,
@@ -410,7 +410,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
         OutOfMemoryError,
         size < one_exa_bytes,
         "CUDA out of memory. Tried to allocate more than 1EB memory.");
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     void* r = nullptr;
     if (size != 0) {
@@ -442,7 +442,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
   }

   bool initialized() override {
-    return devs_initialized_flags.size() > 0;
+    return !devs_initialized_flags.empty();
   }

   static inline void assertValidDevice(int device) {
@@ -466,8 +466,8 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     // TORCH_CHECK(devs_initialized_flags[device], ...)?
     lazy_init_device(device);

-    size_t device_free;
-    size_t device_total;
+    size_t device_free = 0;
+    size_t device_total = 0;
     C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
     pytorch_memory_limits[device] =
         static_cast(fraction * device_total);
@@ -481,14 +481,14 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     // introduces performance nondeterminism.
   }

-  void emptyCache(void) override {
+  void emptyCache() override {
     std::lock_guard<std::mutex> lk(general_mutex);

     for (int dev = 0; dev < device_count; dev++) {
       if (devs_initialized_flags[dev]) {
         CUDAGuard g(dev);

-        cudaMemPool_t mempool;
+        cudaMemPool_t mempool = nullptr;
         cudaDeviceGetDefaultMemPool(&mempool, dev);
         cudaDeviceSynchronize();
         cudaMemPoolTrimTo(mempool, 0);
@@ -533,8 +533,8 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     CUDAGuard g(device);
     lazy_init_device(device);

-    size_t free_upper_bound;
-    size_t device_total;
+    size_t free_upper_bound = 0;
+    size_t device_total = 0;
     C10_CUDA_CHECK(cudaMemGetInfo(&free_upper_bound, &device_total));
     TORCH_INTERNAL_ASSERT(
         free_upper_bound + pytorch_used_bytes[device] <= device_total);
@@ -542,7 +542,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
         free_upper_bound,
         pytorch_memory_limits[device] - pytorch_used_bytes[device]);
     auto stream = c10::cuda::getCurrentCUDAStream();
-    void* dummy;
+    void* dummy = nullptr;

     // Defensively checks for preexisting CUDA error state.
     auto err = cudaGetLastError();
@@ -668,7 +668,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     if (devs_initialized_flags[device]) {
       CUDAGuard g(device);

-      cudaMemPool_t mempool;
+      cudaMemPool_t mempool = nullptr;
       C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
       C10_CUDA_CHECK(cudaMemPoolGetAttribute(
           mempool, cudaMemPoolAttrReservedMemCurrent, &reserved_mem_current));
@@ -725,7 +725,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     assertValidDevice(device);

     CUDAGuard g(device);
-    cudaMemPool_t mempool;
+    cudaMemPool_t mempool = nullptr;
     C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
     // Using zero as the reset value is the method recommended by Cuda driver
     // team. Vivek Kini says:
@@ -783,7 +783,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
       CUDAGuard g(free_stream.device);

       // CUDACachingAllocator.cpp uses raw cuda events, as do we.
-      cudaEvent_t event;
+      cudaEvent_t event = nullptr;
       C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
       C10_CUDA_CHECK(cudaEventRecord(event, free_stream.stream));
       C10_CUDA_CHECK(cudaStreamWaitEvent(capture_stream.stream(), event));
@@ -817,7 +817,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     if (nbytes == 0) {
       return nullptr;
     }
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     void* r = nullptr;
     mallocAsync(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
@@ -828,7 +828,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     if (nbytes == 0) {
       return nullptr;
     }
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
     void* r = nullptr;
     mallocAsync(&r, device, nbytes, stream);
@@ -843,7 +843,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     // cudaDeviceEnablePeerAccess. We need pool-specific enablement. See
     // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
     c10::cuda::CUDAGuard device_guard(dev);
-    cudaMemPool_t mempool;
+    cudaMemPool_t mempool = nullptr;
     C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, dev_to_access));
     cudaMemAccessDesc desc = {};
     desc.location.type = cudaMemLocationTypeDevice;
@@ -851,7 +851,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     desc.flags = cudaMemAccessFlagsProtReadWrite;
     C10_CUDA_CHECK(cudaMemPoolSetAccess(mempool, &desc, 1 /* numDescs */));
   }
-  virtual cudaError_t memcpyAsync(
+  cudaError_t memcpyAsync(
       void* dst,
       int dstDevice,
       const void* src,
diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp
index fa992610afa..24a7175ec12 100644
--- a/c10/cuda/driver_api.cpp
+++ b/c10/cuda/driver_api.cpp
@@ -14,7 +14,7 @@ DriverAPI create_driver_api() {
   C10_FORALL_DRIVER_LIBRARIES(OPEN_LIBRARIES)
 #undef OPEN_LIBRARIES

-  DriverAPI r;
+  DriverAPI r{};

 #define LOOKUP_ENTRY(name, n) \
   r.name##_ = ((decltype(&name))dlsym(handle_##n, #name)); \
diff --git a/c10/cuda/impl/CUDATest.cpp b/c10/cuda/impl/CUDATest.cpp
index c5d9e3f1bf2..f2d4ae97651 100644
--- a/c10/cuda/impl/CUDATest.cpp
+++ b/c10/cuda/impl/CUDATest.cpp
@@ -10,7 +10,7 @@ namespace cuda {
 namespace impl {

 bool has_cuda_gpu() {
-  int count;
+  int count = 0;
   C10_CUDA_IGNORE_ERROR(cudaGetDeviceCount(&count));
   return count != 0;
diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp
index 8dff28f0e97..ff8e1d6ccc9 100644
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@@ -22,9 +22,8 @@ C10_DEFINE_bool(
 namespace c10 {

 namespace {
-// NOLINTNEXTLINE(modernize-redundant-void-arg)
-std::function<string(void)>* GetFetchStackTrace() {
-  static std::function<string(void)> func = []() {
+std::function<string()>* GetFetchStackTrace() {
+  static std::function<string()> func = []() {
     return get_backtrace(/*frames_to_skip=*/1);
   };
   return &func;
diff --git a/c10/util/SmallVector.cpp b/c10/util/SmallVector.cpp
index ddece6801f6..14b2fa9eb67 100644
--- a/c10/util/SmallVector.cpp
+++ b/c10/util/SmallVector.cpp
@@ -112,6 +112,7 @@ void* SmallVectorBase<Size_T>::mallocForGrow(
     size_t TSize,
     size_t& NewCapacity) {
   NewCapacity = getNewCapacity<Size_T>(MinSize, TSize, this->capacity());
+  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
   auto Result = std::malloc(NewCapacity * TSize);
   if (Result == nullptr) {
     throw std::bad_alloc();
@@ -128,6 +129,7 @@ void SmallVectorBase<Size_T>::grow_pod(
   size_t NewCapacity = getNewCapacity<Size_T>(MinSize, TSize, this->capacity());
   void* NewElts = nullptr;
   if (BeginX == FirstEl) {
+    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
     NewElts = std::malloc(NewCapacity * TSize);
     if (NewElts == nullptr) {
       throw std::bad_alloc();
@@ -137,6 +139,7 @@ void SmallVectorBase<Size_T>::grow_pod(
     memcpy(NewElts, this->BeginX, size() * TSize);
   } else {
     // If this wasn't grown from the inline copy, grow the allocated space.
+    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
     NewElts = std::realloc(this->BeginX, NewCapacity * TSize);
     if (NewElts == nullptr) {
       throw std::bad_alloc();
diff --git a/c10/util/TypeList.h b/c10/util/TypeList.h
index d923137bf7d..6857623a4e7 100644
--- a/c10/util/TypeList.h
+++ b/c10/util/TypeList.h
@@ -338,9 +338,7 @@ struct last<typelist<Head, Tail...>> final {
 };
 template <class TypeList>
 using last_t = typename last<TypeList>::type;
-static_assert(
-    std::is_same<int, last_t<typelist<double, float, int>>>::value,
-    "");
+static_assert(std::is_same<int, last_t<typelist<double, float, int>>>::value);

 /**
  * Take/drop a number of arguments from a typelist.
diff --git a/c10/util/int128.cpp b/c10/util/int128.cpp
index 0486f1c7bd9..b074c05698e 100644
--- a/c10/util/int128.cpp
+++ b/c10/util/int128.cpp
@@ -57,13 +57,14 @@ const uint128_pod kuint128max = {
   } while (0)

 static inline int Fls64(uint64_t n) {
   // GOOGLE_DCHECK_NE(0, n);
-  int pos = 0;
+  uint64_t pos = 0;
   STEP(uint64_t, n, pos, 0x20);
   uint32_t n32 = n;
   STEP(uint32_t, n32, pos, 0x10);
   STEP(uint32_t, n32, pos, 0x08);
   STEP(uint32_t, n32, pos, 0x04);
-  return pos + ((uint64_t{0x3333333322221100u} >> (n32 << 2)) & 0x3);
+  return static_cast<int>(
+      pos + ((uint64_t{0x3333333322221100u} >> (n32 << 2)) & 0x3));
 }

 #undef STEP
@@ -128,7 +129,7 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) {

   // Select a divisor which is the largest power of the base < 2^64.
   uint128 div;
-  std::streamsize div_base_log = 0;
+  int div_base_log = 0;
   switch (flags & std::ios::basefield) {
     case std::ios::hex:
       div = (uint64_t)0x1000000000000000u; // 16^15
diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h
index 3233c4857cb..88babf7d0fc 100644
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@@ -50,6 +50,7 @@ struct DontIncreaseRefcount {};
 // tells us if the object was allocated by us.  If it wasn't, no
 // intrusive_ptr for you!

+// NOLINTNEXTLINE(cppcoreguidelines-virtual-class-destructor)
 class C10_API intrusive_ptr_target {
   // Note [Weak references for intrusive refcounting]
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/c10/util/numa.cpp b/c10/util/numa.cpp
index 6612d6a4dba..7d87a9a2e6d 100644
--- a/c10/util/numa.cpp
+++ b/c10/util/numa.cpp
@@ -47,6 +47,7 @@ int GetNUMANode(const void* ptr) {
   AT_ASSERT(ptr);

   int numa_node = -1;
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   TORCH_CHECK(
       get_mempolicy(
           &numa_node,
@@ -78,12 +79,15 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) {
   uintptr_t page_start_ptr =
       ((reinterpret_cast<uintptr_t>(ptr)) & ~(getpagesize() - 1));
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions)
   ptrdiff_t offset = reinterpret_cast<uintptr_t>(ptr) - page_start_ptr;
   // Avoid extra dynamic allocation and NUMA api calls
   AT_ASSERT(
       numa_node_id >= 0 &&
       static_cast<unsigned>(numa_node_id) < sizeof(unsigned long) * 8);
   unsigned long mask = 1UL << numa_node_id;
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
   TORCH_CHECK(
       mbind(
           reinterpret_cast<void*>(page_start_ptr),
diff --git a/c10/util/reverse_iterator.h b/c10/util/reverse_iterator.h
index 16d6db3fc47..5951adfdb41 100644
--- a/c10/util/reverse_iterator.h
+++ b/c10/util/reverse_iterator.h
@@ -84,7 +84,7 @@ class reverse_iterator {
   constexpr reverse_iterator& operator=(const reverse_iterator& rhs) noexcept {
     current = rhs.current;
-    return current;
+    return *this;
   }

   template <class U>
diff --git a/c10/util/signal_handler.cpp b/c10/util/signal_handler.cpp
index 41def36b90a..dd27b737283 100644
--- a/c10/util/signal_handler.cpp
+++ b/c10/util/signal_handler.cpp
@@ -112,6 +112,7 @@ FatalSignalHandler::FatalSignalHandler()
       writingCond(PTHREAD_COND_INITIALIZER),
       writingMutex(PTHREAD_MUTEX_INITIALIZER) {}

+// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
 FatalSignalHandler::signal_handler FatalSignalHandler::kSignalHandlers[] = {
     {"SIGABRT", SIGABRT, {}},
     {"SIGINT", SIGINT, {}},
@@ -159,7 +160,7 @@ void FatalSignalHandler::stacktraceSignalHandler(bool needsLock) {
   if (needsLock) {
     pthread_mutex_lock(&writingMutex);
   }
-  pid_t tid = syscall(SYS_gettid);
+  pid_t tid = static_cast<pid_t>(syscall(SYS_gettid));
   std::string backtrace = fmt::format(
       "{}({}), PID: {}, Thread {}: \n {}",
       fatalSignalName,
@@ -201,7 +202,7 @@ void FatalSignalHandler::fatalSignalHandler(int signum) {
   DIR* procDir = opendir("/proc/self/task");
   if (procDir) {
     pid_t pid = getpid();
-    pid_t currentTid = syscall(SYS_gettid);
+    pid_t currentTid = static_cast<pid_t>(syscall(SYS_gettid));
     struct dirent* entry = nullptr;
     pthread_mutex_lock(&writingMutex);
     while ((entry = readdir(procDir)) != nullptr) {