diff --git a/CMakeLists.txt b/CMakeLists.txt index 3634fdd7da8..0c6df83c616 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -270,7 +270,7 @@ if (MSVC) endif() # /bigobj increases number of sections in .obj file, which is needed to link - # against libaries in Python 2.7 under Windows + # against libraries in Python 2.7 under Windows set(${flag_var} "${${flag_var}} /MP /bigobj") endforeach(flag_var) diff --git a/CODEOWNERS b/CODEOWNERS index 35bdeccfafa..a5bbe99d07c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -10,7 +10,7 @@ /test/test_c10d.py @pietern @mrshenli @zhaojuanmao /torch/utils/cpp_extension.py @goldsborough @fmassa @soumith @ezyang -# Not there to stricly require the approval, but to be tagged as a reviewer +# Not there to strictly require the approval, but to be tagged as a reviewer # on the PRs to push them into a high priority inbox. /torch/csrc/api/data/ @apaszke /torch/csrc/autograd/ @apaszke diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 4a8031b26e9..801f3905e1e 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -24,7 +24,7 @@ else() set(CAFFE2_STATIC_LINK_CUDA_INT 0) endif() CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") -# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortuantely, +# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortunately, # this file generates AT_ROCM_ENABLED() which is required by the miopen # files, which are compiled even if we are doing a vanilla CUDA build. # Once we properly split CUDA and HIP in ATen, we can remove this code. diff --git a/aten/src/ATen/core/boxing/kernel_lambda.h b/aten/src/ATen/core/boxing/kernel_lambda.h index 62105358406..6d84024c2e8 100644 --- a/aten/src/ATen/core/boxing/kernel_lambda.h +++ b/aten/src/ATen/core/boxing/kernel_lambda.h @@ -8,7 +8,7 @@ namespace c10 { namespace detail { // WrapRuntimeKernelFunctor: Wraps any runtime functor into a functor that // inherits from c10::OperatorKernel, so it can be used as a c10 kernel. - // This can, for example, be used for lamdas, functors or even function pointers. + // This can, for example, be used for lambdas, functors or even function pointers. // In the case of function pointers, since it is a runtime function pointer, // there is an overhead for calling it whenever the kernel is invoked. 
template class WrapRuntimeKernelFunctor_ {}; diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 6ae21611bfa..82fc2606837 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -184,7 +184,7 @@ struct FunctionSchema { std::vector returns_; // if true then this schema takes an arbitrary number of additional arguments // after the argument specified in arguments - // currently this is used primarily to represent 'primtive' operators whose + // currently this is used primarily to represent 'primitive' operators whose // arguments are not checked by schema bool is_vararg_; bool is_varret_; diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 4265a1f7e7e..48e4dc65fbb 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1366,7 +1366,7 @@ struct getTypePtr_> final { } // namespace detail template inline TypePtr getTypePtr() { - // TODO: static_assert that a templated function exists, and throw a friendy + // TODO: static_assert that a templated function exists, and throw a friendly // error message if not return detail::getTypePtr_::call(); } diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 3768d4369a5..1161a876cc8 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -84,7 +84,7 @@ public: // a constexpr variable if we never odr-use it. But it seems that some // versions GCC/Clang have buggy determinations on whether or not an // identifier is odr-used or not, and in any case it's hard to tell if - // a variable is odr-used or not. So best to just cut the probem at the root. + // a variable is odr-used or not. So best to just cut the problem at the root. static constexpr int size() { return 32 / sizeof(T); } diff --git a/aten/src/ATen/cuda/CUDAGenerator.cpp b/aten/src/ATen/cuda/CUDAGenerator.cpp index 3ad10ff94c4..f328cfd8a7d 100644 --- a/aten/src/ATen/cuda/CUDAGenerator.cpp +++ b/aten/src/ATen/cuda/CUDAGenerator.cpp @@ -94,7 +94,7 @@ uint64_t CUDAGenerator::current_seed() const { } /** - * Gets a nondeterminstic random number from /dev/urandom or time, + * Gets a nondeterministic random number from /dev/urandom or time, * seeds the CPUGenerator with it and then returns that number. * * FIXME: You can move this function to Generator.cpp if the algorithm diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index b76f9328d3e..7f4fc3ed5f0 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -53,7 +53,7 @@ namespace at { namespace cuda { // NOTE [ ATen NVRTC Stub and HIP ] // // ATen's NVRTC stub library, caffe2_nvrtc, provides dynamic loading of both -// NVRTC and driver APIs. While the former is not yet suppoted for HIP, the +// NVRTC and driver APIs. While the former is not yet supported for HIP, the // later is supported and needed (e.g., in CUDAHooks::getDeviceWithPrimaryContext() // used by tensor.pin_memory()). 
// diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 5803b3e135f..c1ce3190678 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -76,7 +76,7 @@ public: T* desc() const { return desc_.get(); } T* desc() { return desc_.get(); } - // Use mut_desc() to access the underlying desciptor pointer + // Use mut_desc() to access the underlying descriptor pointer // if you intend to modify what it points to (e.g., using // cudnnSetFooDescriptor). This will ensure that the descriptor // is initialized. Code in this file will use this function. diff --git a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h index 8ad73a9be70..4d0b6645ac9 100644 --- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -27,7 +27,7 @@ namespace c10 { namespace hip { // HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP". // For example, when you use HIPified PyTorch, you say x.cuda() to // move a tensor onto ROCm device. We call this situation "HIP -// maquerading as CUDA". +// masquerading as CUDA". // // This leads to a very awkward situation when we want to call c10_hip // code from PyTorch, since c10_hip is expecting things to be called diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index f5b7d9c5511..fee22ba92d5 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -61,7 +61,7 @@ public: T* desc() const { return desc_.get(); } T* desc() { return desc_.get(); } - // Use mut_desc() to access the underlying desciptor pointer + // Use mut_desc() to access the underlying descriptor pointer // if you intend to modify what it points to (e.g., using // miopenSetFooDescriptor). This will ensure that the descriptor // is initialized. Code in this file will use this function. 
diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 110e7d8502a..aac337bb0c4 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -1104,7 +1104,7 @@ Tensor _lu_solve_helper_cpu(const Tensor& self, const Tensor& LU_data, const Ten return self_working_copy; } -// Supports arbitrary batch dimensions for self and LU_data (implicity LU_pivots also) +// Supports arbitrary batch dimensions for self and LU_data (implicitly LU_pivots also) Tensor lu_solve(const Tensor& self, const Tensor& LU_data, const Tensor& LU_pivots) { TORCH_CHECK(self.dim() >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 7616c25368b..48446a98559 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -59,7 +59,7 @@ static inline void multi_margin_loss_cpu_kernel( using accscalar_t = at::acc_type; // dim() != 0 check is for 1d input which produces a scalar output (that - // cannot be handeld by TensorAccessor) + // cannot be handled by TensorAccessor) if (reduction == Reduction::None && output.dim() > 0) { auto output_acc = output.accessor(); for (int64_t t = 0; t < nframe; t++) { diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 031182c7d38..2d856003fe6 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -295,7 +295,7 @@ static std::vector gather_quantized_params_dynamic( } return result; #else // USE_FBGEMM - TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN wihtout FBGEMM!") + TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN without FBGEMM!") #endif // USE_FBGEMM } diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 6f21f2d4c47..a9e629983dd 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -276,7 +276,7 @@ std::tuple kthvalue( return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim); } -// this does not reduce to median with dim beause we don't want to copy twice +// this does not reduce to median with dim because we don't want to copy twice Tensor median_cpu(const Tensor& self) { NoNamesGuard guard; TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index cf831e5b9e6..aa1f5bdf80b 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -618,7 +618,7 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) self - sparse tensor, its shape is sizes = sparse_shape + dense_shape indices - 2-D tensor of indices, shape is (sparse_dims, nnz) values - (1+len(dense_shape))-D tensor of values, shape is (nnz,) + dense_shape - index_select(dim, index) returns a sparse tensor with the follwing data + index_select(dim, index) returns a sparse tensor with the following data new_sizes = sizes[:dim] + (n,) + sizes[dim+1:] new_indices - shape is (sparse_dims, new_nnz) new_values - shape is (new_nnz,) + dense_shape diff --git a/aten/src/ATen/native/Unfold3d.cpp b/aten/src/ATen/native/Unfold3d.cpp index 75331a6c86f..c20dc1a89fb 100644 --- a/aten/src/ATen/native/Unfold3d.cpp +++ b/aten/src/ATen/native/Unfold3d.cpp @@ -85,7 +85,7 @@ static void unfolded3d_copy( const int64_t input_hw = input_height * 
input_width; const int64_t input_dhw = input_hw * input_depth; - // the following variables are updated ouside the most inner loop + // the following variables are updated outside the most inner loop int64_t d = d_out * dT - pT + i; int64_t h = h_out * dH - pH + j; int64_t ofs = nip * input_dhw + d * input_hw + h * input_width; diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index cf127c093b5..581fdd2dc03 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -28,7 +28,7 @@ * are computed from the input and the output size; * * - * When the scales are infered from the input and output sizes, + * When the scales are inferred from the input and output sizes, * we view each pixel as an area, idx + 0.5 as its center index. * Here is an example formula in 1D case. * if align_corners: center of two corner pixel areas are preserved, diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 88a28318ca7..da6c27006b9 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -26,7 +26,7 @@ struct Dist { // map : This tells how to modify (a - b) to form the component that // gets summed. // red : This tells how to sum the result of map up. This is - // separate because the inf norm actuall uses max instead of + // separate because the inf norm actually uses max instead of // sum. // finish : This tells what to do with the aggregated value to compute // the norm. Generally this is the result of val ^ (1 / p). diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 84b0f785bc8..fa3fc257ccf 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -158,7 +158,7 @@ namespace at { namespace native { namespace { * `apply_fn` will be called multiple times, and together cover the entire * output spatial space. * - * Now you should be able tp understand everything about the implementaion of + * Now you should be able tp understand everything about the implementation of * 2D forward kernel shown at the beginning of this note. * **/ diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 73f32ad2c8e..fa94539f3ae 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -117,7 +117,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { Device dst_device = iter.device(0); Device src_device = iter.device(1); - // Enable p2p access between devices. (No-op if it invovles the CPU) + // Enable p2p access between devices. (No-op if it involves the CPU) bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device); if (copy_requires_temporaries(iter, p2p_enabled)) { diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index 07da2125378..83abb48fcee 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -364,7 +364,7 @@ namespace { // assuming grad_grid is contiguous // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW + // 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW // 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; gGrid_ptr_NHW[0] = gix_mult * gix; @@ -383,7 +383,7 @@ namespace { // assuming grad_grid is contiguous // thus we can - // 1. 
use index with gGrid_sW to diectly compute gGrid_ptr_NHW + // 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW // 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; gGrid_ptr_NHW[0] = static_cast(0); @@ -569,7 +569,7 @@ namespace { // assuming grad_grid is contiguous // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW + // 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW // 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; gGrid_ptr_NDHW[0] = gix_mult * gix; @@ -591,7 +591,7 @@ namespace { // assuming grad_grid is contiguous // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW + // 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW // 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; gGrid_ptr_NDHW[0] = static_cast(0); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 59a4255dc36..35fdb15edbb 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -108,7 +108,7 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size, } static std::vector computeLinearStride(const Tensor & tensor) { - // computes the stride as if tensor were contigous + // computes the stride as if tensor were contiguous auto sizes = tensor.sizes(); std::vector stride(tensor.dim()); stride[tensor.dim() - 1] = 1; diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 2ee25c31a3b..b2c2041015d 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -7,7 +7,7 @@ // // The gpu_kernel_with_scalars generates specializations that support a // single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar -// is lifted to a kernel paramter instead of copying to device memory. +// is lifted to a kernel parameter instead of copying to device memory. // This should be used in conjunction with TensorIterator::allow_cpu_scalars_, // which is the default for TensorIterator::binary_op. Otherwise, all inputs // and the output must be on the GPU. diff --git a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh index 9be51dc18e4..d30cc3b0c58 100644 --- a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh +++ b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh @@ -51,7 +51,7 @@ __device__ __forceinline__ void warp_reduce(acc_t* sum) { // A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp. // This is important because it means only __shfl_ instructions are required for reductions. // Note that this means WARP_SIZE must be a power of two and <= architecture warp size. -// CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch. +// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch. // ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs. // is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed. 
// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t. diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 9cd92bbaac1..8d1c174aca8 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -200,7 +200,7 @@ __global__ void cunn_SpatialSoftMaxForward( for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) { const uint32_t data_offset = outer_offset + inner_index; //////////////////////////////////////////////////////////// - // These two blocks are really eqivalent, but specializing on + // These two blocks are really equivalent, but specializing on // blockDim.x == 1 makes the kernel faster when it's unused. // I didn't want to thread an extra template parameter, and nvcc // seems to be smart enough to hoist the if outside of the loops. diff --git a/aten/src/ATen/native/cuda/SortingKthValue.cu b/aten/src/ATen/native/cuda/SortingKthValue.cu index a58727e13c1..0e29fca0902 100644 --- a/aten/src/ATen/native/cuda/SortingKthValue.cu +++ b/aten/src/ATen/native/cuda/SortingKthValue.cu @@ -177,7 +177,7 @@ void kthvalue_cuda_template( AT_CUDA_CHECK(cudaGetLastError()); } -// this does not reduce to median with dim beause we don't want to copy twice +// this does not reduce to median with dim because we don't want to copy twice template Tensor median_cuda_template(const Tensor& self) { TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 66df8a83713..ac47057e4e6 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -211,7 +211,7 @@ inline int64_t resolve_root_int( // (row + 2f - 1)row <= 2x // row^2 + (2f-1)row - 2x <= 0. [3] // -// Based on ineuqality [3], we have the following coefficients for formula of +// Based on inequality [3], we have the following coefficients for formula of // root: // a = 1 // b = 2f - 1 @@ -254,7 +254,7 @@ inline void get_coordinate_in_tril_trapezoid( // (-row + 2f + 1)row <= 2x // row^2 - (2f+1)row + 2x >= 0. [3] // -// Based on ineuqality [3], we have the following coefficients for formula of +// Based on inequality [3], we have the following coefficients for formula of // root: // a = 1 // b = -1 - 2f diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index ac45d0e7d4c..9744ba74c22 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -213,7 +213,7 @@ __device__ __forceinline__ static void upsample_increment_value_bounded( accscalar_t value) { int access_y = max(min(y, height - 1), 0); int access_x = max(min(x, width - 1), 0); - /* TODO: result here is trucated to scalar_t, + /* TODO: result here is truncated to scalar_t, check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912 */ gpuAtomicAdd( diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 2c8a9fba35a..11633ece96e 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1119,7 +1119,7 @@ std::tuple pack_hidden>(const Tensor& struct DropoutState { // Both buffer and event are lazily instantiated when a dropout state is needed // for the first time. Note that in this case needed != used, as we don't need - // a bufer to e.g. 
run RNNs in test mode. + // a buffer to e.g. run RNNs in test mode. at::Tensor buffer; c10::optional event; std::mutex mutex; diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 336d6d7fae8..312f11e42bc 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -99,7 +99,7 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output, // 1. if this dim idx becomes 1, will need to add (size - 1) * stride // 2. otherwise, will need to subtract stride if (from_slice_indices[d] == 0) { - // Substract. Carries over to previous dimension + // Subtract. Carries over to previous dimension from_slice_data -= output.stride(d); } else if (from_slice_indices[d] == 1) { // Dimension index becomes 1 @@ -107,7 +107,7 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output, from_slice_data += (output.size(d) - 1) * output.stride(d); break; } else { - // Substract. Doesn't carry over to previous dimension + // Subtract. Doesn't carry over to previous dimension from_slice_data -= output.stride(d); break; } diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 4ec30471a19..7750097eb22 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -43,7 +43,7 @@ using namespace mkldnn; namespace { // Helper function for getting an ideep tensor out of an aten Tensor. -// Note in case the aten Tensor is a dense tensor, the retured ideep +// Note in case the aten Tensor is a dense tensor, the returned ideep // tensor is just a view of the storage of the aten dense tensor, so // caller needs to make sure the aten dense tensor's lifetime is // longer than the ideep tensor. diff --git a/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp b/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp index c4baa29574d..f582725b6a3 100644 --- a/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp +++ b/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp @@ -23,7 +23,7 @@ inline int start_index(int out_idx, int out_len, int in_len) { * in_len: the dimension_size of input matrix * Basically, in_len / out_len gives the number of * elements in each average computation. - * This functin computes the start index on input matrix. + * This function computes the start index on input matrix. 
*/ return (int)std::floor((float)(out_idx * in_len) / out_len); } diff --git a/aten/src/ATen/native/quantized/cpu/qclamp.cpp b/aten/src/ATen/native/quantized/cpu/qclamp.cpp index 1c2751c83bf..9773aef7bac 100644 --- a/aten/src/ATen/native/quantized/cpu/qclamp.cpp +++ b/aten/src/ATen/native/quantized/cpu/qclamp.cpp @@ -23,7 +23,7 @@ Tensor quantized_clamp_impl( qclamp_stub(qx.device().type(), qx, *min, *max, qy); } else { TORCH_CHECK( - false, "Both min and max should be specifed for quantized clamp!"); + false, "Both min and max should be specified for quantized clamp!"); } return qy; } diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp index fa9788998c4..a17f6c69c5d 100644 --- a/aten/src/ATen/native/quantized/cpu/qmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp @@ -15,7 +15,7 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) { TORCH_CHECK(qa.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported in Mul."); TORCH_CHECK(qa.qscheme() == qb.qscheme(), - "Both inputs to Mul must have the same quantization shceme."); + "Both inputs to Mul must have the same quantization scheme."); TORCH_CHECK(qa.numel() == qb.numel(), "Mul operands must be the same size!"); TORCH_CHECK(qa.scalar_type() == qb.scalar_type(), diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-neon.c index 53677e7418c..729c7268154 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-neon.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-neon.c @@ -63,7 +63,7 @@ void pytorch_qnnp_requantize_fp32__neon( #ifdef __aarch64__ /* - * Leverage "Floating-point Convert to Signed integer, rouding to nearest + * Leverage "Floating-point Convert to Signed integer, rounding to nearest * with ties to even" instruction. This is an ARMv8 instruction (always * available in AArch64), which saturates result on overflow. We don't need * to specifically consider saturated results, they will be clamped at the diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-psimd.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-psimd.c index 4c8e296a0d8..8de33720534 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-psimd.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/fp32-psimd.c @@ -46,7 +46,7 @@ void pytorch_qnnp_requantize_fp32__psimd( * - Large int32_t values can't be exactly represented as FP32. We expect * that conversion instruction would round it to nearest FP32 value with * ties to even, but Clang documentation for __builtin_convertvector does - * not guaratee that. + * not guarantee that. * - Product of two FP32 values is generally not exactly representation as * an FP32 value, and will be rounded to nearest FP32 value with ties to * even. 
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/precise-scalar.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/precise-scalar.c index 5485832932b..352396b01a3 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/precise-scalar.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/requantization/precise-scalar.c @@ -91,7 +91,7 @@ void pytorch_qnnp_requantize_precise__scalar_unsigned32( * * To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do * it in two steps: - * - Shift by 32, which can be implemented by extacting the high 32-bit word + * - Shift by 32, which can be implemented by extracting the high 32-bit word * on 32-bit systems. * - Shift by (shift - 32), which can be implemented as a 32-bit shift of * high word of addition result. diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h index 89e1ee112f9..e587c1d55ea 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h @@ -11,7 +11,7 @@ struct QnnpackOperatorDeleter { }; // PackedWeight struct for QNNPACK stores the original Weight and Bias as -// QNNPACK currently does not support an unpack function. Possible optimiation - +// QNNPACK currently does not support an unpack function. Possible optimization - // For PyTorch Mobile, once the model is scripted and serialized we don't need // to call unpack, so we can save some memory by checking for this case. // Input scale is set to null in pre-pack step. QNNPACK needs bias quantized with diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 2ebe633c269..1eb4794d3f6 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -61,7 +61,7 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT TORCH_CHECK(cuda::check_device({sparse_, r_, t, dense})); TORCH_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor"); - TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims"); + TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " sparse dims"); // no need to check dense_dim because dense_dim + sparse_dim = dim // mxk * kxn = mxn diff --git a/aten/src/TH/THGeneral.h.in b/aten/src/TH/THGeneral.h.in index 9597581efcb..21118d46401 100644 --- a/aten/src/TH/THGeneral.h.in +++ b/aten/src/TH/THGeneral.h.in @@ -33,7 +33,7 @@ // we should merge macros. #ifdef _WIN32 #if !defined(AT_CORE_STATIC_WINDOWS) -// TODO: unfiy the controlling macros. +// TODO: unify the controlling macros. #if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) #define TH_CPP_API __declspec(dllexport) #else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) diff --git a/aten/src/TH/THStorage.h b/aten/src/TH/THStorage.h index 404d587529e..a98e1fe9a06 100644 --- a/aten/src/TH/THStorage.h +++ b/aten/src/TH/THStorage.h @@ -1,4 +1,4 @@ #pragma once #include -// Compatability header. Use THStorageFunctions.h instead if you need this. +// Compatibility header. Use THStorageFunctions.h instead if you need this. 
diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index fcf8a583568..744dbe97961 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -42,7 +42,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { // [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll] // nDimension corresponds to the "true" ATen dimension. -// nDimensionLegacyNoScalars correpsonds to the ATen dimension, except scalars are viewed as 1-dimensional tensors. +// nDimensionLegacyNoScalars corresponds to the ATen dimension, except scalars are viewed as 1-dimensional tensors. // nDimensionLegacyAll corresponds to the ATen dimension, except scalars are viewed as 1-dimensional tensors // and tensors with a dimension of size zero are collapsed to 0-dimensional tensors. // diff --git a/aten/src/TH/generic/THVectorDispatch.cpp b/aten/src/TH/generic/THVectorDispatch.cpp index 9e88b839626..2b6abe83f53 100644 --- a/aten/src/TH/generic/THVectorDispatch.cpp +++ b/aten/src/TH/generic/THVectorDispatch.cpp @@ -197,7 +197,7 @@ void THVector_(normal_fill)(scalar_t *data, } /* - * This struct's constructor initalizes the dispatch tables. It simply checks + * This struct's constructor initializes the dispatch tables. It simply checks * what SIMD extensions are available, and then walks the dispatch table * to choose the best function. * NOTE: As implemented, it will initialize the dispatch pointer to the first supported function. diff --git a/aten/src/THC/THCIntegerDivider.cuh b/aten/src/THC/THCIntegerDivider.cuh index 75c0a5079ff..0cf79536526 100644 --- a/aten/src/THC/THCIntegerDivider.cuh +++ b/aten/src/THC/THCIntegerDivider.cuh @@ -6,7 +6,7 @@ #include #endif -// A utility class to implement integer division by muliplication, given a fixed +// A utility class to implement integer division by multiplication, given a fixed // divisor. // // WARNING: The fast divider algorithm is only implemented for unsigned int; diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 4bf80aba1aa..53c2839b79e 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -41,7 +41,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( weight = THCTensor_(newContiguous)(state, weight); bias = bias ? THCTensor_(newContiguous)(state, bias) : bias; - // Following the behvaior of other THCUNN functions, we shape the output + // Following the behavior of other THCUNN functions, we shape the output // Tensor ourselves int batchSize = input->size(0); diff --git a/benchmarks/framework_overhead_benchmark/C2Module.py b/benchmarks/framework_overhead_benchmark/C2Module.py index 43f8f9f6a8c..acd2f2b13bf 100644 --- a/benchmarks/framework_overhead_benchmark/C2Module.py +++ b/benchmarks/framework_overhead_benchmark/C2Module.py @@ -14,7 +14,7 @@ class C2SimpleNet(object): """ This module constructs a net with 'op_name' operator. The net consist a series of such operator. - It intializes the workspace with input blob equal to the number of parameters + It initializes the workspace with input blob equal to the number of parameters needed for the op. Provides forward method to run the net niter times. 
""" diff --git a/benchmarks/operator_benchmark/README.md b/benchmarks/operator_benchmark/README.md index ac494f45e54..95e0e46bf79 100644 --- a/benchmarks/operator_benchmark/README.md +++ b/benchmarks/operator_benchmark/README.md @@ -37,7 +37,7 @@ List all the supported tests: $ python -m pt.add_test --list_tests ``` -Filter and run a test (use `add_M8_N16_K32` as an exapmle): +Filter and run a test (use `add_M8_N16_K32` as an example): ``` $ python -m pt.add_test --test_name add_K32_M8_N1 --omp_num_threads 1 --mkl_num_threads 1 diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py index 3fbedac20b6..aadc591853b 100644 --- a/benchmarks/operator_benchmark/benchmark_caffe2.py +++ b/benchmarks/operator_benchmark/benchmark_caffe2.py @@ -145,7 +145,7 @@ OpMeta = namedtuple("OpMeta", "op_type num_inputs input_dims input_types \ def generate_c2_test_from_ops(ops_metadata, bench_op, tags): """ - This function is used to generate Caffe2 tests based on the meatdata + This function is used to generate Caffe2 tests based on the metadata of operators. The metadata includes seven fields which are 1) op_type: the name of the operator. 2) num_inputs: the number of input blobs. 3) input_dims: a dictionary which includes the shapes of the input blobs. diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index ad5a2045617..edf9bb14e82 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -93,7 +93,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct tags = attr["tags"] continue - # if 'cuda' is sepcified in input shape but the testing machines doesn't + # if 'cuda' is specified in input shape but the testing machines doesn't # support, we will skip this input if 'cuda' in attr.values(): if not torch.cuda.is_available(): diff --git a/benchmarks/operator_benchmark/pt/qrnn_test.py b/benchmarks/operator_benchmark/pt/qrnn_test.py index c3b14f76ed7..7dc7dc7e0c7 100644 --- a/benchmarks/operator_benchmark/pt/qrnn_test.py +++ b/benchmarks/operator_benchmark/pt/qrnn_test.py @@ -52,7 +52,7 @@ class LSTMBenchmark(op_bench.TorchBenchmarkBase): self.x = torch.randn(sequence_len, # sequence length batch_size, # batch size - I) # Number of featues in X + I) # Number of features in X self.h = torch.randn(NL * (D + 1), # layer_num * dir_num batch_size, # batch size H) # hidden size diff --git a/binaries/convert_and_benchmark.cc b/binaries/convert_and_benchmark.cc index f230f0f7b7c..06983bb8b81 100644 --- a/binaries/convert_and_benchmark.cc +++ b/binaries/convert_and_benchmark.cc @@ -62,7 +62,7 @@ C10_DEFINE_string( "Report the conversion stage time to screen. " "The format of the string is |. " "The valid type is 'json'. " - "The valid identifier is nothing or an identifer that prefix every line"); + "The valid identifier is nothing or an identifier that prefix every line"); C10_DEFINE_string( scale, "-1,-1", diff --git a/binaries/convert_image_to_tensor.cc b/binaries/convert_image_to_tensor.cc index 26397a1c14c..2cbfb0703b3 100644 --- a/binaries/convert_image_to_tensor.cc +++ b/binaries/convert_image_to_tensor.cc @@ -63,7 +63,7 @@ C10_DEFINE_string( "Report the conversion stage time to screen. " "The format of the string is |. " "The valid type is 'json'. 
" - "The valid identifier is nothing or an identifer that prefix every line"); + "The valid identifier is nothing or an identifier that prefix every line"); C10_DEFINE_string( scale, "-1,-1", diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index 579ef00820b..871ddb38318 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -203,7 +203,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { data_ptr_ = std::move(data_ptr); // NOTE: data_type might change and so it's also possible that capacity // might not be divisible by itemsize. There is no way for us to keep track - // of the exact capacity if we're not explicity storing is. More conrectely + // of the exact capacity if we're not explicitly storing is. More concretely // capacity() might not return the value that was set here, if itemsize does // not evenly divide it. numel_ = capacity / data_type_.itemsize(); diff --git a/c10/test/util/registry_test.cpp b/c10/test/util/registry_test.cpp index f2c752acd40..915389dda58 100644 --- a/c10/test/util/registry_test.cpp +++ b/c10/test/util/registry_test.cpp @@ -5,7 +5,7 @@ #include // Note: we use a different namespace to test if the macros defined in -// Registry.h actuall works with a different namespace from c10. +// Registry.h actually works with a different namespace from c10. namespace c10_test { class Foo { diff --git a/caffe2/contrib/gloo/allreduce_ops.cc b/caffe2/contrib/gloo/allreduce_ops.cc index 56ef1237043..f1c61c050f3 100644 --- a/caffe2/contrib/gloo/allreduce_ops.cc +++ b/caffe2/contrib/gloo/allreduce_ops.cc @@ -10,7 +10,7 @@ namespace { /** - * This is a helper function which attemtps to get a base value depending on the + * This is a helper function which attempts to get a base value depending on the * # of nodes. Larger the base the better performance (up to 4) is what we have * observed in gloo benchmarks. At the moment bcube works only if # nodes = base * ^ x. Where x is some constant. So, if # node don't match our expectation diff --git a/caffe2/contrib/gloo/allreduce_ops.h b/caffe2/contrib/gloo/allreduce_ops.h index 90970fdd737..f1662813442 100644 --- a/caffe2/contrib/gloo/allreduce_ops.h +++ b/caffe2/contrib/gloo/allreduce_ops.h @@ -63,7 +63,7 @@ class AllreduceOp final : public Operator { // Store which inputs/outputs this instance initialized with update(init_); - // Verify inputs == ouputs + // Verify inputs == outputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); for (auto i = 0; i < init_.inputs.size(); i++) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); diff --git a/caffe2/contrib/gloo/allreduce_ops_gpu.cc b/caffe2/contrib/gloo/allreduce_ops_gpu.cc index 73df62b5b4c..a4c90fdfc3d 100644 --- a/caffe2/contrib/gloo/allreduce_ops_gpu.cc +++ b/caffe2/contrib/gloo/allreduce_ops_gpu.cc @@ -37,7 +37,7 @@ std::unique_ptr<::gloo::Algorithm> initializeAlgorithm( } /** - * This is a helper function which attemtps to get a base value depending on the + * This is a helper function which attempts to get a base value depending on the * # of nodes. Larger the base the better performance (up to 4) is what we have * observed in gloo benchmarks. At the moment bcube works only if # nodes = base * ^ x. Where x is some constant. 
So, if # node don't match our expectation diff --git a/caffe2/contrib/gloo/broadcast_ops.h b/caffe2/contrib/gloo/broadcast_ops.h index 04e18f951bf..5c3af429bd4 100644 --- a/caffe2/contrib/gloo/broadcast_ops.h +++ b/caffe2/contrib/gloo/broadcast_ops.h @@ -58,7 +58,7 @@ class BroadcastOp final : public Operator { // Store which inputs/outputs this instance initialized with update(init_); - // Verify inputs == ouputs + // Verify inputs == outputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); for (auto i = 0; i < init_.inputs.size(); i++) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); diff --git a/caffe2/contrib/gloo/reduce_scatter_ops.h b/caffe2/contrib/gloo/reduce_scatter_ops.h index 8581a8aaa71..56113807d54 100644 --- a/caffe2/contrib/gloo/reduce_scatter_ops.h +++ b/caffe2/contrib/gloo/reduce_scatter_ops.h @@ -73,7 +73,7 @@ class ReduceScatterOp final : public Operator { // Store which inputs/outputs this instance initialized with update(init_); - // Verify inputs == ouputs + // Verify inputs == outputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); for (auto i = 0; i < init_.inputs.size(); i++) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); diff --git a/caffe2/contrib/opencl/OpenCL/cl.hpp b/caffe2/contrib/opencl/OpenCL/cl.hpp index 5c9be5c5eb0..87cc4a17d10 100644 --- a/caffe2/contrib/opencl/OpenCL/cl.hpp +++ b/caffe2/contrib/opencl/OpenCL/cl.hpp @@ -68,7 +68,7 @@ * The following example shows a general use case for the C++ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for - * decriptions of these features. + * descriptions of these features. * * \code * #define __CL_ENABLE_EXCEPTIONS diff --git a/caffe2/contrib/playground/checkpoint.py b/caffe2/contrib/playground/checkpoint.py index 1cde83e1a6b..52b9ceb2ff0 100644 --- a/caffe2/contrib/playground/checkpoint.py +++ b/caffe2/contrib/playground/checkpoint.py @@ -108,7 +108,7 @@ def broadcast_parameters(opts, model, num_xpus, broadcast_computed_param=False): else caffe2_pb2.CPU for params in all_params: assert len(params) % num_xpus == 0, \ - "Current model dosen't match device number when loading checkpoint" + "Current model doesn't match device number when loading checkpoint" params_per_xpu = int(len(params) / num_xpus) for idx in range(params_per_xpu): blobs = [param for param in params[idx::params_per_xpu]] diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index b1d6d79cce5..28ca3c6fb4f 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -507,7 +507,7 @@ void TensorRTTransformer::Transform( return true; }; - // function to convert runnbale subgraph into a trt op. Note that to keep the + // function to convert runnable subgraph into a trt op. Note that to keep the // interface clean, we do the double conversion from C2 op to Onnx ops here // but it should be OK as the cost is really small. We also need to keep the // same exporter throughout the process to avoid duplicated dummy name diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h index a7976a2d85f..5309314af0c 100644 --- a/caffe2/core/blob_serialization.h +++ b/caffe2/core/blob_serialization.h @@ -38,7 +38,7 @@ CAFFE2_API void SerializeBlob( /** * @brief Convenience function to serialize a blob to a string. 
* - * This is a conveinence function to serialize small Blobs that produce + * This is a convenience function to serialize small Blobs that produce * manageable serialized strings. To serialize big blobs such as * large sparse tensors, use the fully-functional interface in * blob_serializer_base.h. diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 5d674227ac4..2c1b3efb016 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -92,7 +92,7 @@ inline Dst dynamic_cast_if_rtti(Src ptr) { } // SkipIndices are used in operator_fallback_gpu.h and operator_fallback_mkl.h -// as utilty functions that marks input / output indices to skip when we use a +// as utility functions that marks input / output indices to skip when we use a // CPU operator as the fallback of GPU/MKL operator option. template class SkipIndices { diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index eebba3bfb95..63b53c4e589 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -174,7 +174,7 @@ std::unique_ptr g_cub_allocator; static std::unordered_map g_cuda_device_affiliation; // Data structures for optional memory tracking. Access to these structures -// is garded by the CUDAContext::mutex. +// is guarded by the CUDAContext::mutex. static std::unordered_map g_size_map; static std::vector g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0); static std::vector g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0); diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index c391651d849..994c97c6c69 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -471,7 +471,7 @@ class ComputeBlobRecyclingForDag { } } - // Rturns true if the op that generates that blob acquires all tokens. + // Returns true if the op that generates that blob acquires all tokens. inline bool can_use_blob( const string& blob_name, std::unordered_set* tokens, diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 7bfb47fe1ca..7a98615af52 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -76,7 +76,7 @@ class CAFFE2_API NetBase : public Observable { * seconds spent during the benchmark. The 0-th item is the time spent per * each network run, and if a net instantiation supports run_individual, * the remainder of the vector returns the number of milliseconds spent per - * opeartor. + * operator. */ virtual vector TEST_Benchmark( const int /*warmup_runs*/, diff --git a/caffe2/core/net_async_tracing.cc b/caffe2/core/net_async_tracing.cc index 29afcdd7b8a..eaf14c581b7 100644 --- a/caffe2/core/net_async_tracing.cc +++ b/caffe2/core/net_async_tracing.cc @@ -461,7 +461,7 @@ std::shared_ptr create( const std::string& net_name) { // Enable the tracer if the net has the "enable_tracing" argument set OR // if the command line option includes the net name option in the list of - // tracable nets. + // traceable nets. bool trace_net = hasEnableTracingFlag(net) || isTraceableNetName(net_name); return trace_net ? 
std::make_shared(net, net_name, getTracingConfigFromNet(net)) diff --git a/caffe2/core/net_simple_refcount.cc b/caffe2/core/net_simple_refcount.cc index 36981539bd5..45c8e3500e3 100644 --- a/caffe2/core/net_simple_refcount.cc +++ b/caffe2/core/net_simple_refcount.cc @@ -24,7 +24,7 @@ SimpleRefCountNet::SimpleRefCountNet( std::map last_consumed_at; std::set created_by_me; - // For each opeartor + // For each operator for (int idx = 0; idx < net_def->op_size(); ++idx) { const auto& op_def = net_def->op(idx); for (const string& in_name : op_def.input()) { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index ebc73e17ecb..812fea7be7c 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -254,7 +254,7 @@ struct CAFFE2_API NNModule { NNModule(NNModule&&) = default; NNModule() {} - /* Repalce subgraph sg by node, using the order of + /* Replace subgraph sg by node, using the order of * node_inputs and node_outputs to determine how to link * them to the node. node_inputs *must* enumerate all the * inputs to the subgraph (NeuralNetData that do not diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index cda39467fca..4283a304948 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -645,7 +645,7 @@ class CAFFE2_API OperatorBase : public Observable { std::string type_; vector inputs_; vector outputs_; - // Preferrably use c10::optional, but nvcc doesn't work + // Preferably use c10::optional, but nvcc doesn't work #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) std::unique_ptr fn_schema_; vector newstyle_inputs_; diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index e902fb57a7d..dceaffca7a7 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -131,7 +131,7 @@ class CAFFE2_API OpSchema { OpSchema& AllowInplace(std::function inplace); OpSchema& AllowInplace(set> inplace); OpSchema& AllowOneToOneInplace(); - // Sets the rule to enforce in-place opeartion. + // Sets the rule to enforce in-place operation. OpSchema& EnforceInplace(std::function inplace); OpSchema& EnforceInplace(set> inplace); OpSchema& EnforceOneToOneInplace(); diff --git a/caffe2/core/scope_guard.h b/caffe2/core/scope_guard.h index e8dcfe1a936..ee412a424de 100644 --- a/caffe2/core/scope_guard.h +++ b/caffe2/core/scope_guard.h @@ -112,7 +112,7 @@ using ScopeGuardImplDecay = ScopeGuardImpl::type>; /** * ScopeGuard is a general implementation of the "Initialization is * Resource Acquisition" idiom. Basically, it guarantees that a function - * is executed upon leaving the currrent scope unless otherwise told. + * is executed upon leaving the current scope unless otherwise told. * * The MakeGuard() function is used to create a new ScopeGuard object. * It can be instantiated with a lambda function, a std::function, diff --git a/caffe2/core/static_tracepoint_elfx86.h b/caffe2/core/static_tracepoint_elfx86.h index cfe3368e1c2..ca62ddcd835 100644 --- a/caffe2/core/static_tracepoint_elfx86.h +++ b/caffe2/core/static_tracepoint_elfx86.h @@ -32,7 +32,7 @@ #define CAFFE_SDT_ARGSIZE(x) (CAFFE_SDT_ISARRAY(x) ? sizeof(void*) : sizeof(x)) // Format of each probe arguments as operand. -// Size of the arugment tagged with CAFFE_SDT_Sn, with "n" constraint. +// Size of the argument tagged with CAFFE_SDT_Sn, with "n" constraint. 
// Value of the argument tagged with CAFFE_SDT_An, with configured constraint. #define CAFFE_SDT_ARG(n, x) \ [CAFFE_SDT_S##n] "n" ((size_t)CAFFE_SDT_ARGSIZE(x)), \ diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index c17c92aae60..793b5f611d0 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -278,7 +278,7 @@ class CAFFE2_API Workspace { ShouldContinue should_continue = StopOnSignal{}); /* - * Returns a CPU threadpool instace for parallel execution of + * Returns a CPU threadpool instance for parallel execution of * work. The threadpool is created lazily; if no operators use it, * then no threadpool will be created. */ diff --git a/caffe2/experiments/python/net_construct_bench.py b/caffe2/experiments/python/net_construct_bench.py index da98100b16c..b7cf605c0c0 100644 --- a/caffe2/experiments/python/net_construct_bench.py +++ b/caffe2/experiments/python/net_construct_bench.py @@ -31,7 +31,7 @@ import caffe2.python.models.resnet as resnet ''' Simple benchmark that creates a data-parallel resnet-50 model -and measurs the time. +and measures the time. ''' diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index b8509163450..5262b6a116a 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -1023,7 +1023,7 @@ void TransformImage( ColorNormalization(image_data, crop, channels, mean, std); } -// Only crop / transose the image +// Only crop / transpose the image // leave in uint8_t dataType template void CropTransposeImage( diff --git a/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp b/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp index 4d48b6ffb91..f3badf77750 100644 --- a/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp +++ b/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp @@ -68,7 +68,7 @@ * The following example shows a general use case for the C++ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for - * decriptions of these features. + * descriptions of these features. 
* * \code * #define __CL_ENABLE_EXCEPTIONS diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc index ee623e617a2..e155e44b63e 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi.cc @@ -56,7 +56,7 @@ bool NNApi::run(const TensorVector& inputs, TensorVector* outputs) { try { init(inputs, outputs); } catch (const std::exception& e) { - LOG(ERROR) << "Error duing model initialization: " << e.what(); + LOG(ERROR) << "Error during model initialization: " << e.what(); return false; } diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 22ce952cb32..ffe0258d55d 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -1657,7 +1657,7 @@ Caffe2BackendRep* Caffe2Backend::Prepare( } } - // TODO: avoid extra copy by directly feed initialiers to backend blobs + // TODO: avoid extra copy by directly feed initializers to backend blobs OnnxToCaffe2( &rep->init_net(), &rep->pred_net(), diff --git a/caffe2/onnx/onnx_exporter.cc b/caffe2/onnx/onnx_exporter.cc index 41735c12fad..910a0b525a4 100644 --- a/caffe2/onnx/onnx_exporter.cc +++ b/caffe2/onnx/onnx_exporter.cc @@ -185,7 +185,7 @@ void ssaRewriteForIfOp( OperatorDef* op, std::unordered_map* blob_versions, std::set* is_initialized_tensor) { - // Get all the "external" inputs and outpus of the subnet + // Get all the "external" inputs and outputs of the subnet // Since then_net and else_net has same external input/output, we only collect // external input/output from one of its subnet And perform the rewrite to // both then_net and else_net diff --git a/caffe2/onnx/onnx_exporter.h b/caffe2/onnx/onnx_exporter.h index f922c0dbdec..226b2ec0a73 100644 --- a/caffe2/onnx/onnx_exporter.h +++ b/caffe2/onnx/onnx_exporter.h @@ -111,7 +111,7 @@ class CAFFE2_API OnnxExporter { const caffe2::OperatorDef& def, const std::unordered_map& shapes); - // \brief Check black listed arguemnts where we won't pass down when + // \brief Check black listed arguments where we won't pass down when // converting to ONNX node bool IsBlackListed(const caffe2::Argument& arg); diff --git a/caffe2/onnx/torch_ops/defs.cc b/caffe2/onnx/torch_ops/defs.cc index 24e64933b8b..83209aa0fb7 100644 --- a/caffe2/onnx/torch_ops/defs.cc +++ b/caffe2/onnx/torch_ops/defs.cc @@ -138,7 +138,7 @@ ONNX_PYTORCH_OPERATOR_SET_SCHEMA( OpSchema() .SetDoc("Mirror Caffe2 BatchMatMul operator") .Input(0, "X", "tensor of shape (dim0, dim1 ... M, K)", "T") - .Input(1, "Y", "tensor of shpae (dim0, dim2 ... K, N)", "T") + .Input(1, "Y", "tensor of shape (dim0, dim2 ... K, N)", "T") .Output(0, "Z", "tensor of shape (dim0, dim1 ... M, N)", "T") .TypeConstraint( "T", diff --git a/caffe2/operators/activation_ops_cudnn.h b/caffe2/operators/activation_ops_cudnn.h index 30fe9197b96..73fd116a63f 100644 --- a/caffe2/operators/activation_ops_cudnn.h +++ b/caffe2/operators/activation_ops_cudnn.h @@ -31,7 +31,7 @@ class CuDNNActivationOpBase : public Operator { const cudnnDataType_t data_type, const int data_size) { if (data_size != input_size_) { - // Since the best performance is obtained when the tesor is HW-packed, we + // Since the best performance is obtained when the tensor is HW-packed, we // put X.size() to W. 
input_size_ = data_size; CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( diff --git a/caffe2/operators/batch_bucketize_op.cc b/caffe2/operators/batch_bucketize_op.cc index 9dde54c32b2..441ecbfea7b 100644 --- a/caffe2/operators/batch_bucketize_op.cc +++ b/caffe2/operators/batch_bucketize_op.cc @@ -69,7 +69,7 @@ The lengths is a 1D tensor that splits the following 'boundaries' argument. The boundaries is a 1D tensor containing the border list for each feature. With in each batch, `indices` should not have duplicate number, -and the number of elements in `indices` should be less than or euqal to `D`. +and the number of elements in `indices` should be less than or equal to `D`. Each element in `lengths` vector (lengths[`i`]) represents the number of boundaries in the sub border list. The sum of all elements in `lengths` must be equal to the size of `boundaries`. diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index aeefc131f00..73754ef7271 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -126,7 +126,7 @@ OPERATOR_SCHEMA(BatchMatMul) Batch Matrix multiplication Yi = Ai * Bi, where A has shape (dim0, dim1, ... M, K), B has shape (dim0, dim1, ... K, N), Y has shape (dim0, dim1, ... M, N) and i ranges from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being -two diemnsional, it behaves like normal matrix multiplication. +two dimensional, it behaves like normal matrix multiplication. )DOC") .Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)") .Input(1, "B", "tensor of shape (dim0, dim1 ... K, N)") diff --git a/caffe2/operators/bisect_percentile_op.cc b/caffe2/operators/bisect_percentile_op.cc index cec3cf7bc1f..7f8c767fa31 100644 --- a/caffe2/operators/bisect_percentile_op.cc +++ b/caffe2/operators/bisect_percentile_op.cc @@ -46,7 +46,7 @@ OPERATOR_SCHEMA(BisectPercentile) R_2 = [0.3, 1.2]; We will build R = [0.1, 0.4, 0.5, 0.3, 1.2]; besides, we have lengths = [3, 2] - to indicate the boundries of the percentile information. + to indicate the boundaries of the percentile information. )DOC") .Arg( diff --git a/caffe2/operators/box_with_nms_limit_op.h b/caffe2/operators/box_with_nms_limit_op.h index c5c7159cd77..fbaea60bcd7 100644 --- a/caffe2/operators/box_with_nms_limit_op.h +++ b/caffe2/operators/box_with_nms_limit_op.h @@ -51,7 +51,7 @@ class BoxWithNMSLimitOp final : public Operator { "Unexpected soft_nms_method"); soft_nms_method_ = (soft_nms_method_str_ == "linear") ? 1 : 2; - // When input `boxes` doesn't inlcude background class, the score will skip + // When input `boxes` doesn't include background class, the score will skip // background class and start with foreground classes directly, and put the // background class in the end, i.e. score[:, 0:NUM_CLASSES-1] represents // foreground classes and score[:,NUM_CLASSES] represents background class. diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index 8c510f7387b..285a74cf41b 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -97,7 +97,7 @@ class CudnnConvOpBase : public ConvPoolOpBase { } protected: - // A helper function to set up the tensor Nd desriptor, depending on the order + // A helper function to set up the tensor Nd descriptor, depending on the order // the group and the type given. 
template void SetTensorNdDescriptorWithGroup( diff --git a/caffe2/operators/crf_viterbi_op.cc b/caffe2/operators/crf_viterbi_op.cc index 07630a689b1..1557ddd9357 100644 --- a/caffe2/operators/crf_viterbi_op.cc +++ b/caffe2/operators/crf_viterbi_op.cc @@ -209,7 +209,7 @@ OPERATOR_SCHEMA(SwapBestPath) .NumInputs(2) .NumOutputs(1) .SetDoc(R"DOC( -Given a sequence of idices and a matrix, enforce that these indices have the +Given a sequence of indices and a matrix, enforce that these indices have the best columnwise scores score )DOC") diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.cc b/caffe2/operators/fused_rowwise_random_quantization_ops.cc index e7cb974e105..6854b84ab9b 100644 --- a/caffe2/operators/fused_rowwise_random_quantization_ops.cc +++ b/caffe2/operators/fused_rowwise_random_quantization_ops.cc @@ -170,7 +170,7 @@ In Advances in Neural Information Processing Systems, pp. 1508-1518. 2017. )DOC") .Input(0, "input", "Float32 input data") .Output(0, "output", "Fused bitwidth, tail, min, max and quantized data") - .Arg("bitwidth", "How many bits to quantiz per data (defaults to 8).") + .Arg("bitwidth", "How many bits to quantize per data (defaults to 8).") .Arg("random", "random or not (True). False is set up for unittest."); NO_GRADIENT(FloatToFusedRandRowwiseQuantized); diff --git a/caffe2/operators/gather_op.h b/caffe2/operators/gather_op.h index d28bffff376..52f45c19898 100644 --- a/caffe2/operators/gather_op.h +++ b/caffe2/operators/gather_op.h @@ -184,7 +184,7 @@ class GatherOp : public Operator { // an error. // Right now, we apply index wrapping by default only to axis == 0, // since we have ONNX conversion code that uses it. For other ops it - // needs to be speified explicitly with argument or you don't get it. + // needs to be specified explicitly with argument or you don't get it. if (OperatorBase::HasArgument("wrap_indices")) { wrap_indices_ = Operator::template GetSingleArgument( "wrap_indices", (false)); diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index bc594ea5697..37900a7d4dd 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -69,7 +69,7 @@ CAFFE2_API ERArrXXf ComputeSortedAnchors( } // namespace utils // C++ implementation of GenerateProposalsOp -// Generate bounding box proposals for Faster RCNN. The propoasls are generated +// Generate bounding box proposals for Faster RCNN. The proposals are generated // for a list of images based on image score 'score', bounding box // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the diff --git a/caffe2/operators/h_softmax_op.cc b/caffe2/operators/h_softmax_op.cc index f92bfec63f5..f7b0dfb59f4 100644 --- a/caffe2/operators/h_softmax_op.cc +++ b/caffe2/operators/h_softmax_op.cc @@ -632,7 +632,7 @@ search tree. 
.Arg("topN", "Number of nodes in outputs") .Input(0, "X", "Input data from previous layer") .Input(1, "W", "The matrix trained from Softmax Ops") - .Input(2, "b", "The bias traiend from Softmax Ops") + .Input(2, "b", "The bias trained from Softmax Ops") .Output( 0, "Y_names", diff --git a/caffe2/operators/heatmap_max_keypoint_op.cc b/caffe2/operators/heatmap_max_keypoint_op.cc index 441e970c99e..812a768be3a 100644 --- a/caffe2/operators/heatmap_max_keypoint_op.cc +++ b/caffe2/operators/heatmap_max_keypoint_op.cc @@ -140,7 +140,7 @@ bool HeatmapMaxKeypointOp::RunOnDevice() { } assert(std::abs(delta(0)) <= MAX_DELTA); assert(std::abs(delta(1)) <= MAX_DELTA); - // find maximum of detla scores + // find maximum of delta scores keypoints(k, 0 * keypoint_count + j) = x0 + (0.5 + maxX + delta(0)) * xLen / heatmap_size; keypoints(k, 1 * keypoint_count + j) = diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h index adf3dcc8932..96041163cd8 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h @@ -74,7 +74,7 @@ class SparseLengths8BitsRowwiseOp : public Operator { in_block_size, outputSize, indices_size, - N, // embeding table length + N, // embedding table length input_data, indices, lengths, diff --git a/caffe2/operators/load_save_op_util.cc b/caffe2/operators/load_save_op_util.cc index 7b94348e034..e4ca762e479 100644 --- a/caffe2/operators/load_save_op_util.cc +++ b/caffe2/operators/load_save_op_util.cc @@ -27,7 +27,7 @@ void ProcessBlob( auto& blob_states = *blob_states_ptr; if (blob_states.count(key) == 0) { // We reset the blob so that any existing content is destroyed. This - // is to guaranee correct device placement: if we are deserializing + // is to guarantee correct device placement: if we are deserializing // into a TensorCUDA, without explicit Reset we might be loading data // into an existing TensorCUDA that has pre-allocated memory on a // different GPU. diff --git a/caffe2/operators/op_utils_cudnn.h b/caffe2/operators/op_utils_cudnn.h index b76a1826d33..0ea76855b84 100644 --- a/caffe2/operators/op_utils_cudnn.h +++ b/caffe2/operators/op_utils_cudnn.h @@ -46,7 +46,7 @@ inline void LogCuDNNPerfStats( // Easier indexing into force_algo_ vector, // shared by CudnnConvTransposeOpBase and CudnnConvOpBase to force -// usage of a particular algortihm instead of searching +// usage of a particular algorithm instead of searching enum { ALGO_FWD = 0, ALGO_WGRAD = 1, ALGO_DGRAD = 2 }; } // namespace caffe2 diff --git a/caffe2/operators/pool_op_util.cc b/caffe2/operators/pool_op_util.cc index 183d8104114..ffcb0c8f6bb 100644 --- a/caffe2/operators/pool_op_util.cc +++ b/caffe2/operators/pool_op_util.cc @@ -9,7 +9,7 @@ namespace { #if defined(__ARM_NEON__) || defined(__ARM_NEON) -// Vectorizes 4x4p0s0 averge pooling for ARM NEON +// Vectorizes 4x4p0s0 average pooling for ARM NEON void AvgPoolNeon4x4p0s0Plane( int inputH, int inputW, @@ -103,7 +103,7 @@ void AvgPoolNeon4x4p0s0Plane( } } -// Vectorizes 2x2p0s0 averge pooling for ARM NEON +// Vectorizes 2x2p0s0 average pooling for ARM NEON void MaxPoolNeon2x2p0s0Plane( int inputH, int inputW, diff --git a/caffe2/operators/reservoir_sampling.cc b/caffe2/operators/reservoir_sampling.cc index 0e125ddb602..32378623dbf 100644 --- a/caffe2/operators/reservoir_sampling.cc +++ b/caffe2/operators/reservoir_sampling.cc @@ -261,7 +261,7 @@ This operator is thread-safe. 
.Input( 5, "OBJECT_TO_POS_MAP_IN", - "(Optional) Auxillary bookkeeping map. This should be created from " + "(Optional) Auxiliary bookkeeping map. This should be created from " " `CreateMap` with keys of type int64 and values of type int32") .Input( 6, diff --git a/caffe2/operators/rnn/recurrent_network_executor_incl.h b/caffe2/operators/rnn/recurrent_network_executor_incl.h index 90311bbda4d..56914380b50 100644 --- a/caffe2/operators/rnn/recurrent_network_executor_incl.h +++ b/caffe2/operators/rnn/recurrent_network_executor_incl.h @@ -8,7 +8,7 @@ namespace caffe2 { /** - * Struct for operator in a timestep and its dependenceis. + * Struct for operator in a timestep and its dependencies. */ struct RNNNetOperator { int order; // Position in the step net (i.e nth operator) diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index 5bda5d3d9b9..b2441de4316 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -2047,7 +2047,7 @@ i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor. SIndex, Context, ReducerGradient>; - // Will return 3 input version. This is aliging new CPU/GPU nets. + // Will return 3 input version. This is aligning new CPU/GPU nets. using GetGradient = LengthsOpGetGradient< ForwardOp, ReducerDef, diff --git a/caffe2/operators/sparse_normalize_op.cc b/caffe2/operators/sparse_normalize_op.cc index 1c6dfa5596b..516955c108e 100644 --- a/caffe2/operators/sparse_normalize_op.cc +++ b/caffe2/operators/sparse_normalize_op.cc @@ -63,7 +63,7 @@ OPERATOR_SCHEMA(SparseNormalize) "A bool variable to control whether to use max norm \ or constant norm. When use_max_norm = false, constant norm is used so that \ all the embedding vectors are scaled to have a L2 norm equals to A \ - (see blow arugment norm=A). If use_max_norm = true, \ + (see blow argument norm=A). If use_max_norm = true, \ max norm is used so that embedding is scaled so that its l2 norm is no larger \ than A. If an embedding's norm is less than A originally, \ the embedding is left unchanged.\ diff --git a/caffe2/operators/string_ops.h b/caffe2/operators/string_ops.h index 59e81ee894d..49cc322267f 100644 --- a/caffe2/operators/string_ops.h +++ b/caffe2/operators/string_ops.h @@ -11,7 +11,7 @@ namespace caffe2 { * into the elementwise Functor provided, and gathers the results of each * call into the resulting array. Use it as an adaptor if you want to create * a UnaryElementwiseOp that acts on each element of the tensor per function - * call -- this is resonable for complex types where vectorization wouldn't + * call -- this is reasonable for complex types where vectorization wouldn't * be much of a gain, performance-wise. */ template diff --git a/caffe2/operators/stump_func_op.cc b/caffe2/operators/stump_func_op.cc index e5d022fc827..457aca841bf 100644 --- a/caffe2/operators/stump_func_op.cc +++ b/caffe2/operators/stump_func_op.cc @@ -87,7 +87,7 @@ OPERATOR_SCHEMA(StumpFuncIndex) "Index_High", "tensor of int64 indices for elements above threshold") .SetDoc(R"DOC( -Split the elemnts and return the indices based on the given threshold. +Split the elements and return the indices based on the given threshold. 
)DOC"); NO_GRADIENT(StumpFuncIndex); diff --git a/caffe2/operators/summarize_op.cu b/caffe2/operators/summarize_op.cu index 65b2814b5e9..ca57066552d 100644 --- a/caffe2/operators/summarize_op.cu +++ b/caffe2/operators/summarize_op.cu @@ -48,7 +48,7 @@ struct summary_stats_unary_op { // summary_stats_binary_op is a functor that accepts two SummaryStatsData // structs and returns a new SummaryStatsData which are an // approximation to the summary_stats for -// all values that have been agregated so far +// all values that have been aggregated so far template struct summary_stats_binary_op : public thrust::binary_function&, diff --git a/caffe2/operators/tile_op.h b/caffe2/operators/tile_op.h index ad0b924e407..360b58a0934 100644 --- a/caffe2/operators/tile_op.h +++ b/caffe2/operators/tile_op.h @@ -200,7 +200,7 @@ class TileGradientOp final : public Operator { * This is equivalent to multiplying by a vector of 1s transposed. * The gradient of this is all 1s in the shape of the input matrix * (call it X). - * So the output gradient should be the matrix multipication result + * So the output gradient should be the matrix multiplication result * of input gradient (gradient of tiled tensor output) and X. */ const T* dY_data = dY.template data(); diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index 0893448cbb4..ae31ec9a986 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -100,7 +100,7 @@ bool NanCheckOp::RunOnDevice() { for (int j = 0; j < InputSize(); j++) { Tensor cpu_X(CPU); cpu_X.ResizeLike(Input(j)); - // Hack to cause allocaiton happen here, so it won't happen + // Hack to cause allocation happen here, so it won't happen // when we do CopyFrom. We need the mutex then because host->gpu // copies seem to possibly lock with NCCL. cpu_X.mutable_data(); diff --git a/caffe2/opt/backend_cutting.cc b/caffe2/opt/backend_cutting.cc index f1fc6214920..e1f7808d48b 100644 --- a/caffe2/opt/backend_cutting.cc +++ b/caffe2/opt/backend_cutting.cc @@ -389,7 +389,7 @@ caffe2::NetDef OptimizeForBackend( } } - // Find unsupported and supported groups of nodes alernatively + // Find unsupported and supported groups of nodes alternatively context.frontier.clear(); context.current_group.clear(); context.find_supported = false; diff --git a/caffe2/opt/backend_transformer_base.h b/caffe2/opt/backend_transformer_base.h index 8a3d58d685c..033cd55449f 100644 --- a/caffe2/opt/backend_transformer_base.h +++ b/caffe2/opt/backend_transformer_base.h @@ -93,7 +93,7 @@ class BackendTransformerBase { // Input mapping of input name -> original input name std::unordered_map input_mapping_; - // Input mapping of orignal input name -> input name + // Input mapping of original input name -> input name std::unordered_map reverse_input_mapping_; }; } // namespace caffe2 diff --git a/caffe2/opt/custom/glow_net_transform.cc b/caffe2/opt/custom/glow_net_transform.cc index 2276bf72912..0dfcbdbb242 100644 --- a/caffe2/opt/custom/glow_net_transform.cc +++ b/caffe2/opt/custom/glow_net_transform.cc @@ -42,7 +42,7 @@ C10_DEFINE_string( C10_DEFINE_string( onnxifi_input_output_observe_list, "", - "A list of net positins whose corresponding op's inputs and outputs will be" + "A list of net positions whose corresponding op's inputs and outputs will be" " observed. 
"); namespace caffe2 { diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 17ba0b1bc75..25f232700cc 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -694,7 +694,7 @@ bool OnnxifiTransformer::supportOpOnnx( for (const auto& i : n.input()) { bool is_new = used_inputs.emplace(i).second; // The input is not seen and it's not referred by any nodes before as - // output, we count it as an boudary input + // output, we count it as an boundary input if (is_new && !used_outputs.count(i)) { boundary_inputs.emplace_back(i); } diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index 4307e02fdf8..6c0326eb79e 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -22,7 +22,7 @@ struct OnnxifiTransformerOptions final : public BackendTransformOptions { // Pass serialized onnx model if true, otherwise pass serialized c2 model bool use_onnx{false}; - // Whether to adjust batch at the ouptuts or not + // Whether to adjust batch at the outputs or not bool adjust_batch{true}; // Whether to lower model blob by blob diff --git a/caffe2/opt/tvm_transformer.h b/caffe2/opt/tvm_transformer.h index 57ad9bae679..b75c8a4b1a6 100644 --- a/caffe2/opt/tvm_transformer.h +++ b/caffe2/opt/tvm_transformer.h @@ -46,7 +46,7 @@ class CAFFE2_API TvmTransformer final : public BackendTransformerBase { const std::unordered_set& weights, const ShapeInfoMap& shape_hints); - // Apply transform to cluser connected TVM runnable ops into one TVMJitOp + // Apply transform to cluster connected TVM runnable ops into one TVMJitOp NetDef applyTvmTransform( NetDef* pred_net, const std::unordered_set& weights, diff --git a/caffe2/perfkernels/adagrad.h b/caffe2/perfkernels/adagrad.h index e84e33076f0..0ebb93a826a 100644 --- a/caffe2/perfkernels/adagrad.h +++ b/caffe2/perfkernels/adagrad.h @@ -36,12 +36,12 @@ static inline void adagrad_update_base_inlined( // version with prefetching // TODO(msmelyan) // Crux of the computation is computing a / (sqrt(b) + epsilon), -// where a and b are vectors and epislon is very small (eg., 10^-5) and does not +// where a and b are vectors and epsilon is very small (eg., 10^-5) and does not // change. Today it's computed using two vector sqrt and vector divide simd // instructions. It is slow. We can take advantage of existing fast vector // VRSQRTPS instruction that computes approximate reciprocals of square roots // of the vector. It is 6x faster than vsrt and vdiv combinations. Since the -// addition of epislon is just done to avoid division by zero, we approximate a +// addition of epsilon is just done to avoid division by zero, we approximate a // / (sqrt(b) + epsilon) by a / (sqrt(b + sqrt(epsilon)) If we do that, we can // use VRSQRTPS instead now. VRSQRTPS is not very accurate. Specifically, for // the test on random numbers between 0.1 and 1 the absolute error was about @@ -183,12 +183,12 @@ inline void rowwise_adagrad_update_inlined( // version with prefetching // TODO(msmelyan) // Crux of the computation is computing a / (sqrt(b) + epsilon), -// where a and b are vectors and epislon is very small (eg., 10^-5) and does not +// where a and b are vectors and epsilon is very small (eg., 10^-5) and does not // change. Today it's computed using two vector sqrt and vector divide simd // instructions. It is slow. We can take advantage of existing fast vector // VRSQRTPS instruction that computes approximate reciprocals of square roots // of the vector. 
It is 6x faster than vsrt and vdiv combinations. Since the -// addition of epislon is just done to avoid division by zero, we approximate a +// addition of epsilon is just done to avoid division by zero, we approximate a // / (sqrt(b) + epsilon) by a / (sqrt(b + sqrt(epsilon)) If we do that, we can // use VRSQRTPS instead now. VRSQRTPS is not very accurate. Specifically, for // the test on random numbers between 0.1 and 1 the absolute error was about diff --git a/caffe2/predictor/emulator/benchmark.cc b/caffe2/predictor/emulator/benchmark.cc index e45b1307474..0f6f7fd6cbe 100644 --- a/caffe2/predictor/emulator/benchmark.cc +++ b/caffe2/predictor/emulator/benchmark.cc @@ -23,7 +23,7 @@ C10_DEFINE_string( input_dims, "", "The path of the file that " - "stores input dimesions of all the operators in the run net. " + "stores input dimensions of all the operators in the run net. " "Each element of the array is a mapping from " "operator index to its input dimension."); C10_DEFINE_string( diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index e694e813de4..39d6a791b34 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -397,7 +397,7 @@ message ExecutionStep { optional int64 run_every_ms = 11; // If false or not set, execute sub-steps serially. - // If true, execute all substeps concurrently, each one in a separte thread. + // If true, execute all substeps concurrently, each one in a separate thread. optional bool concurrent_substeps = 6; // Name of a scalar boolean tensor. diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index 73ef06999a5..cdd96eb1f49 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -382,7 +382,7 @@ class CheckpointManager(object): Args: user_epoch: An integer. Optional parameter for user to explicitly identify the epoch-id to load checkpoint from - Retruns: + Returns: epoch: the epoch-id to load checkpoints from or None if no checkpoints were written """ @@ -586,7 +586,7 @@ class MultiNodeCheckpointManager(object): Args: user_epoch: An integer. 
Optional parameter for user to explicitly identify the epoch-id to load checkpoint from - Retruns: + Returns: epoch: the epoch-id to load checkpoints from or None if no checkpoints were written """ diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py index 8265d44fbc2..a009f8f0fa3 100644 --- a/caffe2/python/crf.py +++ b/caffe2/python/crf.py @@ -126,7 +126,7 @@ class CRFWithLoss(object): [], value=0, shape=[self.num_classes_padded], dtype=core.DataType.INT32 ) - # Compute the accumlated total score of all the paths + # Compute the accumulated total score of all the paths accum_score = self.model.net.SortedSegmentRangeLogSumExp( [out_last, zero_segment_id] ) diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 7f222394621..bf2a91d7b6a 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -93,7 +93,7 @@ class TestCompositeReader(TestCase): for d, offset in zip(data, offsets): npt.assert_array_equal(d, range(offset, offset + size)) - # Make an identically-sized empty destnation dataset + # Make an identically-sized empty destination dataset dst_ds_schema = schema.Struct( *[ (name, src_ds.content().clone_schema()) @@ -126,7 +126,7 @@ class TestCompositeReader(TestCase): for (name, offset) in zip(names, offsets) ] - # Make an identically-sized empty destnation dataset + # Make an identically-sized empty destination dataset dst_ds_schema = schema.Struct( *[ (name, src_ds_builder.schema()) diff --git a/caffe2/python/examples/imagenet_trainer.py b/caffe2/python/examples/imagenet_trainer.py index adf1b66ef22..e6ae6eebd2d 100644 --- a/caffe2/python/examples/imagenet_trainer.py +++ b/caffe2/python/examples/imagenet_trainer.py @@ -439,7 +439,7 @@ def Train(args): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: - # TODO: merge with multi-prceision optimizer + # TODO: merge with multi-precision optimizer opt = optimizer.build_fp16_sgd( model, args.base_learning_rate, diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py index 166afc5d5a7..7c26f69a0c4 100644 --- a/caffe2/python/functional.py +++ b/caffe2/python/functional.py @@ -83,7 +83,7 @@ class _Functional(object): if schema.inf == max_output: raise ValueError( "For operators with max_output == inf,\ - user should pass num_output explicity." + user should pass num_output explicitly." 
) output_names = get_name_list( output_prefix, max_output, max_output diff --git a/caffe2/python/helpers/fc.py b/caffe2/python/helpers/fc.py index e60d008799b..9d61dc7ac14 100644 --- a/caffe2/python/helpers/fc.py +++ b/caffe2/python/helpers/fc.py @@ -186,7 +186,7 @@ def fc_sparse( model, blob_in, blob_out, w_csr, iw, jw, bias, **kwargs ): - """FC_Sparse: Only takes in alocated weights""" + """FC_Sparse: Only takes in allocated weights""" if not (w_csr and iw and jw and bias): print("Warning...") model.AddParameter(w_csr) diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py index 12c10dfcdd9..9fe3ee51eb5 100644 --- a/caffe2/python/layers/batch_normalization.py +++ b/caffe2/python/layers/batch_normalization.py @@ -38,7 +38,7 @@ class BatchNormalization(ModelLayer): raise ValueError("Please specify a correct order") else: assert len(self.input_shape) == 1, ( - "This layer supports only 4D or 2D tesnors") + "This layer supports only 4D or 2D tensors") input_dims = self.input_shape[0] self.output_schema = schema.Scalar( diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py index c11424f1698..a17e469b742 100644 --- a/caffe2/python/layers/feature_sparse_to_dense.py +++ b/caffe2/python/layers/feature_sparse_to_dense.py @@ -116,7 +116,7 @@ class FeatureSparseToDense(ModelLayer): # we keep ranges blob to check input data later. # Currently this schema with ranges and values is only for # generic type enum 1. If new types are implemented, we need to - # modify the ParseGeneric operator, and this part accordinly + # modify the ParseGeneric operator, and this part accordingly outputs.append( ( field, @@ -260,7 +260,7 @@ class FeatureSparseToDense(ModelLayer): # Currently our implementation only supports # generic type enum 1. 
If new types are implemented, we need to # modify the ParseGeneric operator, the schema above, - # and this part accordinly to parse the generic feature strings + # and this part accordingly to parse the generic feature strings # into input_record ranges = net.LengthsToRanges( diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py index 84e4f992792..53d5c050242 100644 --- a/caffe2/python/layers/functional.py +++ b/caffe2/python/layers/functional.py @@ -90,7 +90,7 @@ class Functional(ModelLayer): elif shapes[blob][0] == 0: shape = tuple(shapes[blob][1:]) else: - logger.warning("unexpeced shape: {}".format(shapes[blob])) + logger.warning("unexpected shape: {}".format(shapes[blob])) # If batch dimension is not first - give up on shape # inference for that blob had_issues = True diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py index 5ec2973f461..0dc6795994c 100644 --- a/caffe2/python/layers/layer_normalization.py +++ b/caffe2/python/layers/layer_normalization.py @@ -33,7 +33,7 @@ class LayerNormalization(ModelLayer): self.axis = axis assert len(self.input_shape) >= 1, ( - "This layer supports only >= 2D tesnors") + "This layer supports only >= 2D tensors") input_dims = self.input_shape[0] self.output_schema = schema.Scalar( diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py index 23b6bed8153..6056da4ba7c 100644 --- a/caffe2/python/layers/random_fourier_features.py +++ b/caffe2/python/layers/random_fourier_features.py @@ -23,8 +23,8 @@ class RandomFourierFeatures(ModelLayer): Inputs: output_dims -- output feature dimensions sigma -- bandwidth for the Gaussian kernel estimator - w_init -- initalization options for weight parameter - b_init -- initalization options for bias parameter + w_init -- initialization options for weight parameter + b_init -- initialization options for bias parameter """ def __init__( diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py index 87b1ef0a3c1..65e44bece97 100644 --- a/caffe2/python/layers/select_record_by_context.py +++ b/caffe2/python/layers/select_record_by_context.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) class SelectRecordByContext(ModelLayer): """ - Allowing model to follow different paths for each instatiation context and + Allowing model to follow different paths for each instantiation context and join later at some point. The implementation use `Alias` because schema sometimes clone fields internally so we need static blob name for output """ diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py index 8f785e284c4..a9ccbc6c932 100644 --- a/caffe2/python/layers/tags.py +++ b/caffe2/python/layers/tags.py @@ -75,7 +75,7 @@ class Tags(object): # In certain cases we want to have different schema for training and # prediction, as an example in prediction we might need to have only - # subset of ids present in the orignal schema. This tag is one of the ways + # subset of ids present in the original schema. This tag is one of the ways # to mark operators that will be removed from prediction and should # override schema for predictors. 
PREDICTION_SCHEMA = 'prediction_schema' diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index f69dce4f7f9..c003e0e3b09 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -78,7 +78,7 @@ def rewrite_run_net_simple(net): core.DeviceOption(device_type=device)) op.engine = "" - # Temporarily disbale conv+relu fusion until we verify further + # Temporarily disable conv+relu fusion until we verify further # net.ParseFromString( # C.transform_optimizeForMKLDNN(net.SerializeToString())) fix_BoxWithNMSLimit(net) @@ -202,7 +202,7 @@ def rewrite_run_net_simple_xrayocr_lstm(net): else cpu_tmp(blob)) arg.n.external_input[:] = new_external_input - # Temporarily disbale conv+relu fusion until we verify further + # Temporarily disable conv+relu fusion until we verify further # net.ParseFromString( # C.transform_optimizeForMKLDNN(net.SerializeToString())) fix_BoxWithNMSLimit(net) diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py index aa7617b026f..c7f3846f9fe 100644 --- a/caffe2/python/modeling/initializers.py +++ b/caffe2/python/modeling/initializers.py @@ -12,7 +12,7 @@ import six class Initializer(object): ''' This class abstracts out parameter creation. One can come up with a new - Initializer in order to implement more complex parameter initializaion logic + Initializer in order to implement more complex parameter initialization logic ''' def __init__(self, operator_name=None, **kwargs): diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py index 80590f43afd..77e5cbd3f8b 100644 --- a/caffe2/python/modeling/parameter_sharing.py +++ b/caffe2/python/modeling/parameter_sharing.py @@ -57,7 +57,7 @@ class ParameterSharingContext(object): candidate_scope = scope.CurrentNameScope() best_scope = self._resolve_scope_overrides(candidate_scope) if best_scope != candidate_scope: - logger.info("Overwiting scope {0} with scope {1}".format( + logger.info("Overwriting scope {0} with scope {1}".format( candidate_scope, best_scope)) return best_scope + name diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py index 17f6c4b0009..1fd0833a718 100644 --- a/caffe2/python/net_drawer.py +++ b/caffe2/python/net_drawer.py @@ -119,7 +119,7 @@ def GetPydotGraph( graph.add_edge(pydot.Edge(input_node, op_node)) for output_name in op.output: if output_name in pydot_nodes: - # we are overwriting an existing blob. need to updat the count. + # we are overwriting an existing blob. need to update the count. pydot_node_counts[output_name] += 1 output_node = blob_node_producer( _escape_label( diff --git a/caffe2/python/onnx/ONNXOpCoverage.md b/caffe2/python/onnx/ONNXOpCoverage.md index a3743f0bf63..bb4b71f0553 100644 --- a/caffe2/python/onnx/ONNXOpCoverage.md +++ b/caffe2/python/onnx/ONNXOpCoverage.md @@ -89,7 +89,7 @@ This doc keeps tracking why operators are not covered by the testcases. 
|Sigmoid|Yes|OK|💚OK| |Sin|Yes|OK|💚OK| |Size|Yes|OK|💚OK| -|Slice|Yes|OK|💔ScatterAssign + Cast, very hacky implementaion, Slice in C2 only supports one dimension| +|Slice|Yes|OK|💔ScatterAssign + Cast, very hacky implementation, Slice in C2 only supports one dimension| |Softmax|Yes|OK|💔Axis and dim has different semantics| |Softplus|Yes|OK|💚OK| |Softsign|Yes||💚OK| diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index 301941afb59..c7a1b7992cf 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -57,7 +57,7 @@ class TestBatchBucketize(serial.SerializedTestCase): indices.sort() boundaries = [] for i in range(d - 3): - # add [0, 0] as duplicated bounary for duplicated bucketization + # add [0, 0] as duplicated boundary for duplicated bucketization if lens[i] > 2: cur_boundary = np.append( np.random.randn(lens[i] - 2) * 5, [0, 0]) diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index 5c6fd368d86..fad93bbe523 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -142,7 +142,7 @@ class TestBoxWithNMSLimitOp(serial.SerializedTestCase): boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, num_classes) if not input_boxes_include_bg_cls: - # remove backgound class + # remove background class boxes = boxes[:, 4:] if cls_agnostic_bbox_reg: # only leave one class @@ -159,7 +159,7 @@ class TestBoxWithNMSLimitOp(serial.SerializedTestCase): np.array(range(1, num_classes), dtype=np.float32), (gt_boxes.shape[0], 1)).T.flatten() if not output_classes_include_bg_cls: - # remove backgound class + # remove background class gt_classes -= 1 gt_boxes = np.tile(gt_boxes, (num_classes - 1, 1)) gt_scores = np.tile(gt_scores, (num_classes - 1, 1)).flatten() diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index 1e17a5315db..62b49295fd8 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -105,7 +105,7 @@ class TestElementwiseBroadcast(serial.SerializedTestCase): Y = np.random.rand(4, 5).astype(np.float32) + 2.0 #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is sumed over 1 and 0 dims to account for broadcast + #latter gradient is summed over 1 and 0 dims to account for broadcast def powt_grad_broadcast(g_out, outputs, fwd_inputs): [GX, GY] = powt_grad(g_out, outputs, fwd_inputs) return ([GX, np.sum(np.sum(GY, 1), 0)]) @@ -127,7 +127,7 @@ class TestElementwiseBroadcast(serial.SerializedTestCase): return powt_op(X, Y[:, :, np.newaxis]) #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is sumed over 3 and 0 dims to account for broadcast + #latter gradient is summed over 3 and 0 dims to account for broadcast def powt_grad_axis1(g_out, outputs, fwd_inputs): [X, Y] = fwd_inputs [GX, GY] = powt_grad(g_out, outputs, [X, Y[:, :, np.newaxis]]) @@ -150,7 +150,7 @@ class TestElementwiseBroadcast(serial.SerializedTestCase): return powt_op(X, Y[:, np.newaxis, np.newaxis, np.newaxis]) #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is sumed over 3, 2 and 1 dims to account for broadcast + #latter gradient is summed over 3, 2 and 1 dims to account for broadcast def powt_grad_axis0(g_out, outputs, 
fwd_inputs): [X, Y] = fwd_inputs [GX, GY] = powt_grad(g_out, @@ -175,7 +175,7 @@ class TestElementwiseBroadcast(serial.SerializedTestCase): return powt_op(X, Y[np.newaxis, :, :, :]) #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is sumed over 0 and 1 dims to account for broadcast + #latter gradient is summed over 0 and 1 dims to account for broadcast def powt_grad_mixed(g_out, outputs, fwd_inputs): [X, Y] = fwd_inputs [GX, GY] = powt_grad(g_out, outputs, [X, Y[np.newaxis, :, :, :]]) diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index f2509dfc35a..12fafd911b1 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -13,7 +13,7 @@ import hypothesis.extra.numpy as hnp # Basic implementation of gather for axis == 0, shich is lookup of indices # in the outer dimension. Keeping it for reference here, although is similar -# to more general funciton below. +# to more general function below. def ref_gather_axis0(): def inner(data, ind): if ind.size == 0 or data.shape[0] == 0: diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py index 03807c50396..ae8c1dc2279 100644 --- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py +++ b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py @@ -45,7 +45,7 @@ class TestHeatmapMaxKeypointOp(hu.HypothesisTestCase): # initial coordinates and interpolate HEATMAP_SIZE from it HEATMAP_SMALL_SIZE = 4 bboxes_in = 500 * np.random.rand(NUM_TEST_ROI, 4).astype(np.float32) - # only bbox with smaller first coordiantes + # only bbox with smaller first coordinates for i in range(NUM_TEST_ROI): if bboxes_in[i][0] > bboxes_in[i][2]: tmp = bboxes_in[i][2] @@ -56,7 +56,7 @@ class TestHeatmapMaxKeypointOp(hu.HypothesisTestCase): bboxes_in[i][3] = bboxes_in[i][1] bboxes_in[i][1] = tmp - # initial randomized coordiantes for heatmaps and expand it with interpolation + # initial randomized coordinates for heatmaps and expand it with interpolation init = np.random.rand( NUM_TEST_ROI, NUM_KEYPOINTS, diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index 19e6ee10e3d..7412c23e62b 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -69,7 +69,7 @@ class TestOneHotOps(serial.SerializedTestCase): lens = np.random.randint(low=1, high=5, size=d) boundaries = [] for i in range(d): - # add [0, 0] as duplicated bounary for duplicated bucketization + # add [0, 0] as duplicated boundary for duplicated bucketization if lens[i] > 2: cur_boundary = np.append( np.random.randn(lens[i] - 2) * 5, [0, 0]) @@ -174,7 +174,7 @@ class TestOneHotOps(serial.SerializedTestCase): lens = np.random.randint(low=1, high=5, size=d) boundaries = [] for i in range(d): - # add [0, 0] as duplicated bounary for duplicated bucketization + # add [0, 0] as duplicated boundary for duplicated bucketization if lens[i] > 2: cur_boundary = np.append( np.random.randn(lens[i] - 2) * 5, [0, 0]) diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index e49a0c21b70..33376e0d20f 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -131,7 +131,7 @@ class TestPooling(hu.HypothesisTestCase): assume(engine != "CUDNN") # some case here could be calculated with global pooling, but 
instead # calculated with general implementation, slower but should still - # be corect. + # be correct. op = core.CreateOperator( op_type, ["X"], diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index acda05145f2..2e031e97904 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -265,7 +265,7 @@ class RecurrentNetworkTest(serial.SerializedTestCase): since there is no enough element of input_state sequence are available. So the initial_state for input_state contains several elements (exactly how many pads we need for the first step). Also, because of - that all offseting over input_state sequnece is being shifted + that all offseting over input_state sequence is being shifted by length of initial_input_state: see `link_offset` and `alias_offset` arguments of RecurrentNetwork. diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index d09920d82fa..5b30da4387f 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -374,7 +374,7 @@ class ProcessingReader(Reader): # from it. with NetBuilder() as nb: # Current NetBuilder is optionally used inside the processor, - # then its children are retrived inside of + # then its children are retrieved inside of # normalize_processor_output. # Once readers and writers also use NetBuilder, # this logic will be more natural. diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py index 97acc3caf94..9fe6ba1c340 100644 --- a/caffe2/python/regularizer.py +++ b/caffe2/python/regularizer.py @@ -150,7 +150,7 @@ class L0ApproxNorm(Regularizer): reg_lambda: parameter to scale regularization by alpha: hyper parameter to tune that is only used in the calculation - of approxiamte L0 norm + of approximate L0 norm budget: desired number of features. If the number of features is greater than the budget amount, then the least important features will diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 108b749e8c3..1d49271805b 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -912,7 +912,7 @@ class MultiRNNCell(RNNCell): ''' Multilayer RNN via the composition of RNNCell instance. - It is the resposibility of calling code to ensure the compatibility + It is the responsibility of calling code to ensure the compatibility of the successive layers in terms of input/output dimensiality, etc., and to ensure that their blobs do not have name conflicts, typically by creating the cells with names that specify layer number. 
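The MultiRNNCell docstring in the rnn_cell.py hunk above notes that it is the caller's responsibility to keep successive layers dimensionally compatible. Below is a minimal, framework-agnostic sketch of that compatibility check; the SimpleCell class and its input_size/output_size attributes are illustrative assumptions for this sketch, not the actual caffe2 RNNCell interface.

    # Illustrative only: verify that stacked cells feed each other correctly,
    # which caffe2's MultiRNNCell leaves to the calling code.
    class SimpleCell(object):
        def __init__(self, input_size, output_size):
            self.input_size = input_size      # assumed attribute, for the sketch
            self.output_size = output_size    # assumed attribute, for the sketch

    def check_stack(cells):
        for lower, upper in zip(cells, cells[1:]):
            assert lower.output_size == upper.input_size, (
                "output dim %d does not match next layer's input dim %d"
                % (lower.output_size, upper.input_size))
        return cells

    check_stack([SimpleCell(128, 64), SimpleCell(64, 32)])    # compatible stack
    # check_stack([SimpleCell(128, 64), SimpleCell(32, 32)])  # would raise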
diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 67118e863c6..50fe136a5a1 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -185,7 +185,7 @@ class Field(object): ) def _pprint_impl(self, indent, str_buffer): - raise NotImplementedError('Field is an abstrct class.') + raise NotImplementedError('Field is an abstract class.') def __repr__(self): str_buffer = StringIO() diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 489defea9ef..ce45ae3cb86 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -83,7 +83,7 @@ def transform_caffe2_net( debug_builder=False, build_serializable_op=True): """ - Transfrom the caffe2_net by collapsing TRT-runnable nodes into trt c2 ops + Transform the caffe2_net by collapsing TRT-runnable nodes into trt c2 ops """ check_gpu_() diff --git a/caffe2/python/visualize.py b/caffe2/python/visualize.py index 1eecdcdc583..626668841a6 100644 --- a/caffe2/python/visualize.py +++ b/caffe2/python/visualize.py @@ -115,7 +115,7 @@ class PatchVisualizer(object): def get_patch_shape(self, patch): """Gets the shape of a single patch. - Basically it tries to interprete the patch as a square, and also check if it + Basically it tries to interpret the patch as a square, and also check if it is in color (3 channels) """ edgeLen = np.sqrt(patch.size) diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index d53da5b2110..9bd1c750b77 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -394,7 +394,7 @@ Int8Tensor = collections.namedtuple( def FetchInt8Blob(name): """Fetches an Int8 blob from the workspace. It shared backend implementation - with FetchBlob but it is recommened when fetching Int8 Blobs + with FetchBlob but it is recommended when fetching Int8 Blobs Inputs: name: the name of the Int8 blob - a string or a BlobReference @@ -429,7 +429,7 @@ def FetchInt8BlobRealVal(name): def _Workspace_fetch_int8_blob(ws, name): """Fetches an Int8 blob from the workspace. It shared backend implementation - with FetchBlob but it is recommened when fetching Int8 Blobs + with FetchBlob but it is recommended when fetching Int8 Blobs Inputs: name: the name of the Int8 blob - a string or a BlobReference diff --git a/caffe2/quantization/server/dnnlowp.cc b/caffe2/quantization/server/dnnlowp.cc index 3e678a37adb..d7fc133ae9b 100644 --- a/caffe2/quantization/server/dnnlowp.cc +++ b/caffe2/quantization/server/dnnlowp.cc @@ -244,7 +244,7 @@ TensorQuantizationParams QuantizationFactory::ChooseQuantizationParams( return ChooseQuantizationParams(min, max, precision, preserve_sparsity); } - /** Ajust the granularity of histogram collection to + /** Adjust the granularity of histogram collection to * the quantization precision. Use 8x more number of bins * in the histogram should be sufficient for linear quantization. 
*/ diff --git a/caffe2/quantization/server/norm_minimization.cc b/caffe2/quantization/server/norm_minimization.cc index c0a74515ef0..9c11b4b9788 100644 --- a/caffe2/quantization/server/norm_minimization.cc +++ b/caffe2/quantization/server/norm_minimization.cc @@ -156,7 +156,7 @@ TensorQuantizationParams NormMinimization::NonlinearQuantizationParamsSearch( start_bin = next_start_bin; end_bin = next_end_bin; } - VLOG(2) << "best quantiation range " << start_bin << "," << end_bin + 1 << "," + VLOG(2) << "best quantization range " << start_bin << "," << end_bin + 1 << "," << norm_min; double selected_sum = 0; diff --git a/caffe2/sgd/clip_tensor_op.cc b/caffe2/sgd/clip_tensor_op.cc index c3de06ba749..c4e03ae84e6 100644 --- a/caffe2/sgd/clip_tensor_op.cc +++ b/caffe2/sgd/clip_tensor_op.cc @@ -24,7 +24,7 @@ OPERATOR_SCHEMA(ClipTensorByScaling) .Input( 2, "additional_threshold", - "An optional additional threshold to scale the orignal threshold") + "An optional additional threshold to scale the original threshold") .Arg("threshold", "Threshold to determine whether to scale down the tensor") .Output( 0, diff --git a/caffe2/utils/map_utils.h b/caffe2/utils/map_utils.h index 3a3d8ea4225..ef8ff0cab70 100644 --- a/caffe2/utils/map_utils.h +++ b/caffe2/utils/map_utils.h @@ -2,7 +2,7 @@ namespace caffe2 { -// Get value from map given key. Return suppiled default value if not found +// Get value from map given key. Return supplied default value if not found // This is a stripped down version from folly: // https://github.com/facebook/folly/blob/5a07e203d79324b68d69f294fa38e43b9671e9b1/folly/MapUtil.h#L35-L45 template < diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 975b1a5ac30..274372e6789 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -1434,7 +1434,7 @@ C10_EXPORT void CopyMatrix( return; } if (lda == N && ldb == N) { - // can coalese to a single memcpy of size M * N + // can coalesce to a single memcpy of size M * N if (copy) { copy(static_cast(A), static_cast(B), N * M); } else { diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h index 7014b0cd118..3ea416298dc 100644 --- a/caffe2/utils/proto_utils.h +++ b/caffe2/utils/proto_utils.h @@ -270,7 +270,7 @@ class C10_EXPORT ArgumentHelper { if (arg_map_.at(name).has_s()) { CAFFE_ENFORCE( message.ParseFromString(arg_map_.at(name).s()), - "Faild to parse content from the string"); + "Failed to parse content from the string"); } else { VLOG(1) << "Return empty message for parameter " << name; } @@ -284,7 +284,7 @@ class C10_EXPORT ArgumentHelper { for (int i = 0; i < messages.size(); ++i) { CAFFE_ENFORCE( messages[i].ParseFromString(arg_map_.at(name).strings(i)), - "Faild to parse content from the string"); + "Failed to parse content from the string"); } return messages; } diff --git a/caffe2/utils/signal_handler.h b/caffe2/utils/signal_handler.h index e61ae84b947..c773bdd4393 100644 --- a/caffe2/utils/signal_handler.h +++ b/caffe2/utils/signal_handler.h @@ -18,7 +18,7 @@ class CAFFE2_API SignalHandler { STOP }; - // Contructor. Specify what action to take when a signal is received. + // Constructor. Specify what action to take when a signal is received. 
SignalHandler(Action SIGINT_action, Action SIGHUP_action); ~SignalHandler(); diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h index b4b129f3856..145dbc160b1 100644 --- a/caffe2/utils/threadpool/WorkersPool.h +++ b/caffe2/utils/threadpool/WorkersPool.h @@ -300,7 +300,7 @@ class alignas(kGEMMLOWPCacheLineSize) Worker { return nullptr; } - // Called by the master thead to give this worker work to do. + // Called by the master thread to give this worker work to do. // It is only legal to call this if the worker void StartWork(Task* task) { DCHECK(!task_.load()); diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h index bd45be9192d..05bc22a73c4 100644 --- a/caffe2/utils/zmq_helper.h +++ b/caffe2/utils/zmq_helper.h @@ -55,7 +55,7 @@ class ZmqSocket { public: explicit ZmqSocket(int type) : context_(1), ptr_(zmq_socket(context_.ptr(), type)) { - CAFFE_ENFORCE(ptr_ != nullptr, "Faild to create zmq socket."); + CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq socket."); } ~ZmqSocket() { diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h index 85046276115..f968b664dec 100644 --- a/caffe2/video/video_input_op.h +++ b/caffe2/video/video_input_op.h @@ -532,7 +532,7 @@ void VideoInputOp::GetLabelsFromProto( } } else { // For multiple label case, output label is a binary vector - // where presented concepts are makred 1 + // where presented concepts are marked 1 memset(label_data, 0, sizeof(int) * num_of_class_ * num_clips); for (int i = 0; i < num_clips; i++) { for (int j = 0; j < label_proto.int32_data_size(); j++) { diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake index 46043367362..6b463b9e920 100644 --- a/cmake/Modules/FindvecLib.cmake +++ b/cmake/Modules/FindvecLib.cmake @@ -1,4 +1,4 @@ -# Find the vecLib libraries as part of Accelerate.framework or as standalon framework +# Find the vecLib libraries as part of Accelerate.framework or as standalone framework # # The following are set after configuration is done: # VECLIB_FOUND diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 9a5adc1f6da..d75dabfd459 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -403,7 +403,7 @@ torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA}") -# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. +# disable some nvcc diagnostic that appears in boost, glog, glags, opencv, etc. foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag}) endforeach() diff --git a/docs/source/hub.rst b/docs/source/hub.rst index c6b14d312b5..29f2959125e 100644 --- a/docs/source/hub.rst +++ b/docs/source/hub.rst @@ -100,7 +100,7 @@ Note that ``*args, **kwargs`` in ``torch.load()`` are used to **instantiate** a After you loaded a model, how can you find out what you can do with the model? A suggested workflow is -- ``dir(model)`` to see all avaialble methods of the model. +- ``dir(model)`` to see all available methods of the model. 
- ``help(model.foo)`` to check what arguments ``model.foo`` takes to run To help users explore without referring to documentation back and forth, we strongly diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 3202068ba88..13925ad5490 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -749,7 +749,7 @@ Attributes The TorchScript compiler needs to know the types of `module attributes`_. Most types can be inferred from the value of the member. Empty lists and dicts cannot have their types inferred and must have their types annotated with `PEP 526-style `_ class annotations. -If a type cannot be inferred and is not explicilty annotated, it will not be added as an attribute +If a type cannot be inferred and is not explicitly annotated, it will not be added as an attribute to the resulting :class:`ScriptModule` diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index 191ac5241a6..d0a4f920b92 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -45,7 +45,7 @@ encode the operation history. Every new function requires you to implement 2 met the autograd engine. - :meth:`~torch.autograd.function._ContextMethodMixin.save_for_backward` must be - used when saving input or ouput of the forward to be used later in the backward. + used when saving input or output of the forward to be used later in the backward. - :meth:`~torch.autograd.function._ContextMethodMixin.mark_dirty` must be used to mark any input that is modified inplace by the forward function. - :meth:`~torch.autograd.function._ContextMethodMixin.mark_non_differentiable` must diff --git a/docs/source/notes/windows.rst b/docs/source/notes/windows.rst index 954f1e2e17a..a003816855c 100644 --- a/docs/source/notes/windows.rst +++ b/docs/source/notes/windows.rst @@ -78,7 +78,7 @@ object to make it build on Windows. relative_to=__file__, with_cuda=with_cuda, extra_compile_args=["-std=c99"], - libraries=['ATen', '_C'] # Append cuda libaries when necessary, like cudart + libraries=['ATen', '_C'] # Append cuda libraries when necessary, like cudart ) Second, here is a workground for "unresolved external symbol @@ -176,7 +176,7 @@ You can resolve this by typing the following command. conda install -c peterjc123 vc vs2017_runtime conda install mkl_fft intel_openmp numpy mkl -As for the wheels package, since we didn't pack some libaries and VS2017 +As for the wheels package, since we didn't pack some libraries and VS2017 redistributable files in, please make sure you install them manually. The `VS 2017 redistributable installer `_ can be downloaded. diff --git a/ios/TestApp/benchmark/setup.rb b/ios/TestApp/benchmark/setup.rb index 4220aa3a894..bfda3f884a4 100644 --- a/ios/TestApp/benchmark/setup.rb +++ b/ios/TestApp/benchmark/setup.rb @@ -5,7 +5,7 @@ require 'optparse' options = {} option_parser = OptionParser.new do |opts| opts.banner = 'Script for setting up TestApp.xcodeproj' - opts.on('-t', '--team_id ', 'developemnt team ID') { |value| + opts.on('-t', '--team_id ', 'development team ID') { |value| options[:team_id] = value } end.parse! 
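The extending.rst hunk above touches the note that save_for_backward stashes a forward input or output for later use in backward. As a small self-contained illustration of the standard torch.autograd.Function pattern that note refers to (a generic example, not text taken from the docs):

    import torch

    class Square(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)          # keep the forward input for backward
            return x * x

        @staticmethod
        def backward(ctx, grad_output):
            (x,) = ctx.saved_tensors          # retrieve what was saved in forward
            return 2.0 * x * grad_output      # d(x^2)/dx = 2x

    x = torch.randn(3, requires_grad=True)
    Square.apply(x).sum().backward()
    print(x.grad)                             # equals 2 * x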
diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu index 7caca86bd2b..92e89ae5acc 100644 --- a/modules/detectron/group_spatial_softmax_op.cu +++ b/modules/detectron/group_spatial_softmax_op.cu @@ -25,7 +25,7 @@ namespace { __global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W, const int H, const float* Xdata, float* Pdata, const int num_classes) { - // Loop throuh labels (N x A x H x W) + // Loop through labels (N x A x H x W) CUDA_1D_KERNEL_LOOP(index, num * A * H * W) { int D = num_classes * A; int x = index % W; diff --git a/modules/detectron/select_smooth_l1_loss_op.cc b/modules/detectron/select_smooth_l1_loss_op.cc index 502be37f9b2..7f1441032ac 100644 --- a/modules/detectron/select_smooth_l1_loss_op.cc +++ b/modules/detectron/select_smooth_l1_loss_op.cc @@ -52,7 +52,7 @@ tensor that encodes bounding box regression predictions. 2, "locations", "2D tensor of shape (M, 4) that identifies M 'select' locations " - "encoded by the four colums: (n, c, y, x). The loss is computed on the " + "encoded by the four columns: (n, c, y, x). The loss is computed on the " "four contiguous channel locations [c, c + 3] (inclusive).") .Input( 3, diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu index 611fa6932e3..93635269f17 100644 --- a/modules/detectron/softmax_focal_loss_op.cu +++ b/modules/detectron/softmax_focal_loss_op.cu @@ -146,7 +146,7 @@ template <> bool SoftmaxFocalLossOp::RunOnDevice() { auto& X = Input(0); // Logits auto& T = Input(1); // Labels - auto& wp = Input(2); // num of foregound + auto& wp = Input(2); // num of foreground // average loss as output // softmax probability, going to be re-used in gradient diff --git a/scripts/xcode_build.rb b/scripts/xcode_build.rb index fa97407beef..8faae2da1f5 100644 --- a/scripts/xcode_build.rb +++ b/scripts/xcode_build.rb @@ -16,7 +16,7 @@ option_parser = OptionParser.new do |opts| opts.on('-c', '--provisioning_profile ', 'provisioning profile for code signing') { |value| options[:profile] = value } - opts.on('-t', '--team_id ', 'developemnt team ID') { |value| + opts.on('-t', '--team_id ', 'development team ID') { |value| options[:team_id] = value } end.parse! diff --git a/test/common_device_type.py b/test/common_device_type.py index 687aaf89d05..41d478bbca2 100644 --- a/test/common_device_type.py +++ b/test/common_device_type.py @@ -125,7 +125,7 @@ device_type_test_bases = [] class DeviceTypeTestBase(TestCase): device_type = 'generic_device_type' - # Precision is a thread-local setting since it may be overriden per test + # Precision is a thread-local setting since it may be overridden per test _tls = threading.local() _tls.precision = TestCase.precision @@ -460,7 +460,7 @@ class precisionOverride(object): # Decorator that instantiates a variant of the test for each given dtype. # Notes: # (1) Tests that accept the dtype argument MUST use this decorator. -# (2) Can be overriden for the CPU or CUDA, respectively, using dtypesIfCPU +# (2) Can be overridden for the CPU or CUDA, respectively, using dtypesIfCPU # or dtypesIfCUDA. # (3) Prefer the existing decorators to defining the 'device_type' kwarg. class dtypes(object): diff --git a/test/common_distributed.py b/test/common_distributed.py index a09c1b941bc..bcf0ad6ca94 100644 --- a/test/common_distributed.py +++ b/test/common_distributed.py @@ -194,7 +194,7 @@ class MultiProcessTestCase(TestCase): self.file_name = file_name # self.id() == e.g. 
'__main__.TestDistributed.test_get_rank' - # We're retreiving a corresponding test and executing it. + # We're retrieving a corresponding test and executing it. getattr(self, test_name)() # exit to avoid run teardown() for fork processes sys.exit(0) diff --git a/test/common_utils.py b/test/common_utils.py index 1d30bd43c6b..0722a3ec6bd 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -355,7 +355,7 @@ def skipIfNotRegistered(op_name, message): Args: op_name: Check if this op is registered in `core._REGISTERED_OPERATORS`. - message: mesasge to fail with. + message: message to fail with. Usage: @skipIfNotRegistered('MyOp', 'MyOp is not linked!') diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp index 1dce851264f..c17a0ab9c60 100644 --- a/test/cpp/api/rnn.cpp +++ b/test/cpp/api/rnn.cpp @@ -256,7 +256,7 @@ void copyParameters(torch::nn::ModuleHolder& target, size_t t_i, // This test is a port of python code introduced here: // https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66 -// Reverse forward of bidrectional GRU should act +// Reverse forward of bidirectional GRU should act // as regular forward of unidirectional GRU void BidirectionalGRUReverseForward(bool cuda) { auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false) @@ -307,7 +307,7 @@ TEST_F(RNNTest, BidirectionalGRUReverseForward_CUDA) { BidirectionalGRUReverseForward(true); } -// Reverse forward of bidrectional LSTM should act +// Reverse forward of bidirectional LSTM should act // as regular forward of unidirectional LSTM void BidirectionalLSTMReverseForwardTest(bool cuda) { auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false) diff --git a/test/cpp/jit/test_irparser.cpp b/test/cpp/jit/test_irparser.cpp index b4b5dc40cfe..6d8d72507bc 100644 --- a/test/cpp/jit/test_irparser.cpp +++ b/test/cpp/jit/test_irparser.cpp @@ -125,7 +125,7 @@ graph(%a): AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); } { - // Check that parser corectly handles values reusing the same name. + // Check that parser correctly handles values reusing the same name. auto graph = std::make_shared(); script::parseIR( R"IR( diff --git a/test/dist_autograd_test.py b/test/dist_autograd_test.py index 3cce6041275..04366e6fd3b 100644 --- a/test/dist_autograd_test.py +++ b/test/dist_autograd_test.py @@ -289,7 +289,7 @@ class DistAutogradTest(RpcAgentTestFixture): # For send function when making nest rpc call, # next functions of the send function are two recv functions - # for recevied two tensors from previous call + # for received two tensors from previous call next_funcs = list(send_functions.values())[0].next_functions self.assertEqual(2, len(next_funcs)) self.assertEqual( diff --git a/test/hypothesis_utils.py b/test/hypothesis_utils.py index d54e1056bc1..4180ba4ab2d 100644 --- a/test/hypothesis_utils.py +++ b/test/hypothesis_utils.py @@ -125,7 +125,7 @@ Args: min_side / max_side: minimum and maximum dimensions per rank. Generates: - Possibe shapes for a tensor, constrained to the rank and dimensionality. + Possible shapes for a tensor, constrained to the rank and dimensionality. Example: # Generates 3D and 4D tensors. 
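The common_distributed.py hunk above describes the child process retrieving a corresponding test by name and executing it. A stripped-down sketch of that dispatch pattern, using hypothetical names rather than the real MultiProcessTestCase internals:

    import sys
    import unittest

    class ToyMultiProcessCase(unittest.TestCase):
        def _run_in_child(self, test_name):
            # self.id() would be e.g. '__main__.ToyMultiProcessCase.test_get_rank';
            # resolve the method by its short name and run it in this process.
            getattr(self, test_name)()
            sys.exit(0)  # exit so the forked child skips the parent's teardown

        def test_get_rank(self):
            self.assertTrue(True)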
diff --git a/test/jit/test_data_parallel.py b/test/jit/test_data_parallel.py index 255a128ec16..a4c6b9b741f 100644 --- a/test/jit/test_data_parallel.py +++ b/test/jit/test_data_parallel.py @@ -118,7 +118,7 @@ class TestDataParallel(JitTestCase): r0_forward = replica[0].forward(x) self.assertEqual(second_forward, r0_forward) - # replca which is on a different GPU has a deep copy of the original + # replica which is on a different GPU has a deep copy of the original # params and buffers x1 = torch.ones(2, 2, requires_grad=True).cuda(device=1) r1_forward = replica[1].forward(x1) diff --git a/test/onnx/debug_embed_params.py b/test/onnx/debug_embed_params.py index 14470313cba..9b3f14df0fd 100644 --- a/test/onnx/debug_embed_params.py +++ b/test/onnx/debug_embed_params.py @@ -42,7 +42,7 @@ def run_embed_params(proto, model, input, state_dict=None, use_gpu=True): # TODO: Even better: keyword arguments! for k in model.state_dict(): if k not in state_dict: - # Once PyTorch Module adds unnecessary paramter, the old pre-trained model does not have it. + # Once PyTorch Module adds unnecessary parameter, the old pre-trained model does not have it. # Just simply pass the new one. # TODO: Please don't export unnecessary parameter. parameters.append(model.state_dict()[k]) diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 4fd953a5067..35021ad1cb1 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -337,7 +337,7 @@ class TestCaffe2Backend_opset9(unittest.TestCase): # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, - # othewise expand will fail) + # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. For this case, we need to disable it onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, @@ -382,7 +382,7 @@ class TestCaffe2Backend_opset9(unittest.TestCase): # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, - # othewise expand will fail) + # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. For this case, we need to disable it onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, @@ -425,7 +425,7 @@ class TestCaffe2Backend_opset9(unittest.TestCase): # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, - # othewise expand will fail) + # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. 
For this case, we need to disable it onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 51655acaea3..8f7c582c9c3 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -83,7 +83,7 @@ def run_model_test(self, model, batch_size=2, state_dict=None, input_copy = copy.deepcopy(input) ort_test_with_input(ort_sess, input_copy, output, rtol, atol) - # if addiional test inputs are provided run the onnx + # if additional test inputs are provided run the onnx # model with these inputs and check the outputs if test_with_inputs is not None: for test_input in test_with_inputs: diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index dfd30765c5f..f6dd54c6c6e 100644 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -882,13 +882,13 @@ class TestMSNPUTensor(common.TestCase): weight = torch.empty(6, 4, 2, 2, device='msnpu', requires_grad=True) bias = torch.empty(6, device='msnpu') - # Make sure forward is overriden + # Make sure forward is overridden out = torch.nn.functional.conv1d(input, weight, bias, 2, 0, 1, 1) self.assertEqual(msnpu_extension.get_test_int(), 2) self.assertEqual(out.shape[0], input.shape[0]) self.assertEqual(out.shape[1], weight.shape[0]) - # Make sure backward is overriden + # Make sure backward is overridden # Double backward is dispatched to _convolution_double_backward. # It is not tested here as it involves more computation/overrides. grad = torch.autograd.grad(out, input, out, create_graph=True) diff --git a/test/test_distributions.py b/test/test_distributions.py index 8b3265a2b7f..3dc4d5fbce8 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -80,7 +80,7 @@ except ImportError: def pairwise(Dist, *params): """ - Creates a pair of distributions `Dist` initialzed to test each element of + Creates a pair of distributions `Dist` initialized to test each element of param with each other. 
""" params1 = [torch.tensor([p] * len(p)) for p in params] diff --git a/test/test_numba_integration.py b/test/test_numba_integration.py index 832eee73cd2..a8b29e8dcf6 100644 --- a/test/test_numba_integration.py +++ b/test/test_numba_integration.py @@ -299,7 +299,7 @@ class TestNumbaIntegration(common.TestCase): torch_ary += 42 self.assertEqual(torch_ary.data.numpy(), numpy.asarray(numba_ary) + 42) - # Explict-copy when using `torch.tensor()` + # Explicit-copy when using `torch.tensor()` for numpy_ary in numpy_arys: numba_ary = numba.cuda.to_device(numpy_ary) torch_ary = torch.tensor(numba_ary, device="cuda") diff --git a/test/test_utils.py b/test/test_utils.py index 61df1651d17..7af65ed0002 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -368,7 +368,7 @@ class TestBottleneck(TestCase): def _check_environment_summary(self, output): results = re.search('Environment Summary', output) - self.assertIsNotNone(results, self._fail_msg('Should have Enviroment Summary', output)) + self.assertIsNotNone(results, self._fail_msg('Should have Environment Summary', output)) # Up to five lines away from the heading, there should be the version number results = re.search(r'Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+', output) diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 6d4d9c8436b..ef0fe9bfe5b 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -55,7 +55,7 @@ VIEW_FUNCTIONS = { 'indices': 'self', 'values': 'self', # sparse_coo ctor output should really be views of both indices and values, - # but we only supports making as view of a single varible, and indices is + # but we only supports making as view of a single variable, and indices is # discrete anyways. # FIXME: clone indices on construction. 'sparse_coo_tensor_with_dims_and_tensors': 'values', diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 2f76f183a1b..b6e0e2c9a3e 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -409,7 +409,7 @@ def create_python_bindings(python_functions, has_self, is_module=False): def unpack_variable(name, unpack_expr, typename): # optional> are special. The PythonArgParser returns an - # optional>, which cannot be implictly converted to + # optional>, which cannot be implicitly converted to # optional>. One needs to unwrap the optional and rewrap. if typename == 'c10::optional': result = """\ diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 9887fb03af3..989615ad14b 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -990,7 +990,7 @@ def unpack_args(env, declaration): )) else: # Okay, we are abusing the definition of 'unpack' here a bit, - # although it's stll getting the non-variable from the variable + # although it's still getting the non-variable from the variable # (in this case via TensorOptions rather than Variable/Tensor). body.append(UNPACK_OPTIONS.substitute(arg_name=arg['name'])) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index aa033f2c4a4..5400d08f4e3 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -323,7 +323,7 @@ def gen_nn_modules(out): def replace_forward(m): # We instruct mypy to not emit errors for the `forward` and `__call__` declarations since mypy # would otherwise correctly point out that Module's descendants' `forward` declarations - # conflict with `Module`s. 
Specificlaly, `Module` defines `forward(self, *args)` while the + # conflict with `Module`s. Specifically, `Module` defines `forward(self, *args)` while the # descandantes define more specific forms, such as `forward(self, input: Tensor)`, which # violates Liskov substitutability. The 'mypy' team recommended this solution for now. forward_def = m.group(0) + " # type: ignore" diff --git a/torch/autograd/__init__.pyi b/torch/autograd/__init__.pyi index c80c1e5afdd..4c183d7d48a 100644 --- a/torch/autograd/__init__.pyi +++ b/torch/autograd/__init__.pyi @@ -26,7 +26,7 @@ class NestedIOFunction(Function): # 'func' accepts a vararg of tensors, which isn't expressable in the type system at the moment. # If https://mypy.readthedocs.io/en/latest/additional_features.html?highlight=callable#extended-callable-types is accepted, -# the '...' first argument of Callabe can be replaced with VarArg(Tensor). +# the '...' first argument of Callable can be replaced with VarArg(Tensor). # For now, we permit any input. def gradcheck(func: Callable[..., Union[Tensor, Tuple[Tensor, ...]]], inputs: Union[Tensor, Tuple[Tensor, ...]], eps: float=..., atol: float=..., rtol: float=..., raise_exception: bool=..., check_sparse_nnz: bool=...) -> bool: ... def gradgradcheck(func: Callable[..., Union[Tensor, Tuple[Tensor, ...]]], inputs: Union[Tensor, Tuple[Tensor, ...]], eps: float=..., atol: float=..., rtol: float=..., gen_non_contig_grad_outputs: bool=..., raise_exception: bool=...) -> bool: ... diff --git a/torch/csrc/api/include/torch/data/datasets/base.h b/torch/csrc/api/include/torch/data/datasets/base.h index 9d5ca154b83..9cd318df522 100644 --- a/torch/csrc/api/include/torch/data/datasets/base.h +++ b/torch/csrc/api/include/torch/data/datasets/base.h @@ -93,7 +93,7 @@ class Dataset : public BatchDataset> { } }; -/// A `StreamDataset` reprsents a dataset that is a potentially infinite stream. +/// A `StreamDataset` represents a dataset that is a potentially infinite stream. /// It takes as batch index only a number, which is the batch size, and yields /// that many elements from the stream. template >> diff --git a/torch/csrc/api/include/torch/data/datasets/stateful.h b/torch/csrc/api/include/torch/data/datasets/stateful.h index ceab6e95247..8ebbc7eaf2c 100644 --- a/torch/csrc/api/include/torch/data/datasets/stateful.h +++ b/torch/csrc/api/include/torch/data/datasets/stateful.h @@ -27,7 +27,7 @@ namespace datasets { /// /// Note that when subclassing a from `StatefulDataset`, the return /// type of `get_batch()`, which the subclass must override, will be -/// `optional` (i.e. the type specified in the `StatefulDataset` specialization is automatically boxed into an `optional` for the datast's `BatchType`). +/// `optional` (i.e. the type specified in the `StatefulDataset` specialization is automatically boxed into an `optional` for the dataset's `BatchType`). template < typename Self, typename Batch = std::vector>, diff --git a/torch/csrc/api/include/torch/data/iterator.h b/torch/csrc/api/include/torch/data/iterator.h index 2ba1a5d33df..9e17109045f 100644 --- a/torch/csrc/api/include/torch/data/iterator.h +++ b/torch/csrc/api/include/torch/data/iterator.h @@ -18,7 +18,7 @@ namespace detail { // `Iterator` consists of a `ValidIterator` and a `SentinelIterator`. A // `ValidIterator` yields new batches until the `DataLoader` is exhausted. While // the `DataLoader` is not exhausted, `ValidIterator`s compare equal if they are -// the same object. 
When the `ValidIterator` becomes exhauted, it compares equal +// the same object. When the `ValidIterator` becomes exhausted, it compares equal // to the `SentinelIterator`, but not before. Half the code here is to implement // double dispatch for the comparison. Got damnit, C++. diff --git a/torch/csrc/api/include/torch/nn/modules/container/sequential.h b/torch/csrc/api/include/torch/nn/modules/container/sequential.h index 3e43edad048..113d62eb254 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/container/sequential.h @@ -178,7 +178,7 @@ class SequentialImpl : public Cloneable { input = iterator->any_forward(std::move(input)); } - // Check the return value and give a nice error message if the requsted + // Check the return value and give a nice error message if the requested // return type was incorrect. if (auto* return_value = input.template try_get()) { return std::move(*return_value); diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 367cb294ee4..557cbd46fa0 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -195,7 +195,7 @@ void enableProfiler(ProfilerConfig config) { if(state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to - // to generate some dummy events first before recording syncrhonization events + // to generate some dummy events first before recording synchronization events for(int i = 0; i < 5; i++) { cuda_stubs->onEachDevice([](int d) { mark("__cuda_startup"); diff --git a/torch/csrc/distributed/autograd/context/container.h b/torch/csrc/distributed/autograd/context/container.h index d129f2e736b..bee785fa512 100644 --- a/torch/csrc/distributed/autograd/context/container.h +++ b/torch/csrc/distributed/autograd/context/container.h @@ -13,7 +13,7 @@ namespace autograd { // autograd context for each autograd pass and also cleans up data for an // autograd pass once its done. // -// Each autograd pass is assinged a unique autograd_context_id and all data for +// Each autograd pass is assigned a unique autograd_context_id and all data for // that pass (DistAutogradContext) is stored in this container indexed by the // autograd_context_id. The autograd_context_id itself is a 64 bit globally // unique id. The first 16 bits is the worker_id and the next 48 bits is an diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp index 54933249cd2..f88b654c964 100644 --- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp +++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp @@ -307,7 +307,7 @@ void DistEngine::execute(const variable_list& roots) { // functions. { std::lock_guard guard(initializedContextIdsLock_); - // Context should not have been intialized already. + // Context should not have been initialized already. TORCH_INTERNAL_ASSERT( initializedContextIds_.find(autogradContext->contextId()) == initializedContextIds_.end()); diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index b079ac6d52a..336a589ad14 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -723,7 +723,7 @@ void Reducer::finalize_backward() { // Due to the lazy wait, it is possible that reduction of the current // iteration is still going when the one for next iteration gets kicked off. 
// For such case, we want to wait explicitly to make sure the reduction does - // complete before kicking off next one. Otherwise the prevous one may + // complete before kicking off next one. Otherwise the previous one may // interfere, write to the device-side memory and clobber the content of // local_unused_maps_dev_. if (!local_used_maps_reduced_) { diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index 3644c4e663f..4341ce68ef8 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -275,7 +275,7 @@ std::shared_ptr ProcessGroupAgent::send( } } if (notifyThread) { - // Notify the wathdog thread only after releasing the lock, + // Notify the watchdog thread only after releasing the lock, // so watchdog can acquire lock on waking up. futureTimeoutCV_.notify_one(); } diff --git a/torch/csrc/distributed/rpc/python_rpc_handler.h b/torch/csrc/distributed/rpc/python_rpc_handler.h index 9dddb246f1a..d2615a798b3 100644 --- a/torch/csrc/distributed/rpc/python_rpc_handler.h +++ b/torch/csrc/distributed/rpc/python_rpc_handler.h @@ -49,7 +49,7 @@ class PYBIND11_EXPORT PythonRpcHandler { // Our local tests also caught this segment faults if py::objects are cleaned // up at program exit. The explanation is: CPython cleans up most critical // utilities before cleaning up PythonRpcHandler singleton, so when - // PythonRpcHandler signleton cleans up py::objects and call dec_ref(), it + // PythonRpcHandler singleton cleans up py::objects and calls dec_ref(), it // will crash. // The solution is to clean up py::objects earlier when Rpc agent join(). // Be note that py::objects can not be cleaned up when Rpc agent is destroyed diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index a6b87748b26..c1b05768b8d 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -138,10 +138,10 @@ class TORCH_API RpcAgent { // Retrieve the default rpc agent. static std::shared_ptr getDefaultRpcAgent(); - // Retrive metrics as KV map + // Retrieve metrics as KV map virtual std::unordered_map getMetrics() = 0; - // Retrive debug info in addition to metrics as KV map + // Retrieve debug info in addition to metrics as KV map virtual std::unordered_map getDebugInfo() = 0; protected: diff --git a/torch/csrc/distributed/rpc/rref_context.h b/torch/csrc/distributed/rpc/rref_context.h index a01e54ceec0..2f5a920c522 100644 --- a/torch/csrc/distributed/rpc/rref_context.h +++ b/torch/csrc/distributed/rpc/rref_context.h @@ -164,7 +164,7 @@ class RRefContext { // The follow two maps keep UserRRefs alive by holding a shared_ptr to the // RRef instances. A UserRRef must be added into this map if any of the - // following two conditions is ture: + // following two conditions is true: // // (1) A UserRRef has not been accepted by owner yet. // diff --git a/torch/csrc/distributed/rpc/utils.h b/torch/csrc/distributed/rpc/utils.h index f1c4d574be0..989828b6523 100644 --- a/torch/csrc/distributed/rpc/utils.h +++ b/torch/csrc/distributed/rpc/utils.h @@ -29,7 +29,7 @@ TORCH_API std::pair, std::vector> wireDeserialize( // Some Tensors are effectively views of larger Tensors, where only a small // subset of the Storage data is referenced. This normally is good and avoids // copies when kept locally, but if we naively push the whole Storage over the -// wire, we'll end up with excess network trafic.
This change clones tensors if +// wire, we'll end up with excess network traffic. This change clones tensors if // we'd save at least half the data, and over a minimum hurdle. TORCH_API c10::List cloneSparseTensors( const std::vector& tensors); diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 0fb47c8612d..0ed097f90c0 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -179,7 +179,7 @@ c10::optional toIValue(const Value* v) { if (v->node()->kind() != prim::Constant || v->type()->cast()) { return c10::nullopt; } - // use implemenation of prim::Constant to compute the output IValue + // use implementation of prim::Constant to compute the output IValue auto op = getOperation(v->node()); Stack stack; op(stack); diff --git a/torch/csrc/jit/constants.h b/torch/csrc/jit/constants.h index 86630aef001..4bf8bf8bc67 100644 --- a/torch/csrc/jit/constants.h +++ b/torch/csrc/jit/constants.h @@ -43,7 +43,7 @@ TORCH_API c10::optional tryInsertConstant( // Helper for retrieving constants //////////////////////////////////////////////////////////////////////////////// -// attempt to convert a (possibly constant) Value* into an intepreter value +// attempt to convert a (possibly constant) Value* into an interpreter value // (IValue). returns c10::nullopt if the Value* was not constant TORCH_API c10::optional toIValue(const Value* v); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index d9b10ba2efa..a967554bb78 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -120,7 +120,7 @@ class EncoderBase { protected: // Using std::map instead of std::unordered_map for initializers - // in EncodeGraph cosntructor so that the order in which initializers + // in EncodeGraph constructor so that the order in which initializers // get written to the ONNX graph is always the deterministic and // predictable. While this is not a ONNX requirement, it is needed // for testing purposes in tests that use _export_to_pretty_string() diff --git a/torch/csrc/jit/export_module.cpp b/torch/csrc/jit/export_module.cpp index 0fa3c4d476a..60162408a62 100644 --- a/torch/csrc/jit/export_module.cpp +++ b/torch/csrc/jit/export_module.cpp @@ -115,7 +115,7 @@ class ScriptModuleSerializer { convertNamedType(class_deps_[i]); } - // Mapping of filename => src. We need this because multiple clases may go + // Mapping of filename => src. We need this because multiple classes may go // in the same file (e.g. foo.bar.Baz and foo.bar.Qux) for (auto& item : file_streams_) { const std::string filename = qualifierToArchivePath(item.key(), "code/"); diff --git a/torch/csrc/jit/fuser/codegen.cpp b/torch/csrc/jit/fuser/codegen.cpp index 4aff18ea0f6..b4b893b030a 100644 --- a/torch/csrc/jit/fuser/codegen.cpp +++ b/torch/csrc/jit/fuser/codegen.cpp @@ -509,7 +509,7 @@ std::string generateKernel( env.s("RandInit", ""); } - // Insantiates the CUDA or CPU-specific templates + // Instantiates the CUDA or CPU-specific templates env.s("tensorOffsets", tensorOffsets.str()); env.s("kernelBody", body.str()); env.v("formals", formals); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 643c3923f69..819bbcf5bce 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -581,7 +581,7 @@ struct GraphExecutorImpl : public GraphExecutorImplBase { // Phase 4. If this graph will be differentiated, we need to slice out the // symbolically differentiable subgraphs for further optimizations. // Phase 5. 
Apply non-differentiable optimizations to the graphs we've found - // (or the whole grpah if we know we won't need its derivative). + // (or the whole graph if we know we won't need its derivative). if (needsGradient(opt_graph)) { auto diff_nodes = CreateAutodiffSubgraphs( opt_graph, diff --git a/torch/csrc/jit/import_source.cpp b/torch/csrc/jit/import_source.cpp index 1d60d718510..70918d96bf8 100644 --- a/torch/csrc/jit/import_source.cpp +++ b/torch/csrc/jit/import_source.cpp @@ -297,7 +297,7 @@ struct SourceImporterImpl : public Resolver, continue; } else { if (assign.rhs().present()) { - // This is a constant assignemnt, of the form: + // This is a constant assignment, of the form: // foo : Final[int] = 3 constants.push_back(assign); } else { diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 30affaba838..6cbec865072 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -636,7 +636,7 @@ void initJITBindings(PyObject* module) { // Run the user-supplied function py_func_output = f(*args_tup); - // Convert the output of the user-supplied funciton to IValue. The type + // Convert the output of the user-supplied function to IValue. The type // information of this IValue is used both to record the correct type in // the trace. output_ivalue = toTypeInferredIValue(py_func_output); diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index e9401bf6b63..7b0f772d0b0 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -30,7 +30,7 @@ // Note that currently the backward compatibility is not supported by bytecode. // This format and process need to be revisted and redesigned if we want to -// suppot backward compatibility in future. +// support backward compatibility in future. namespace torch { namespace jit { diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index cc8aaf2dbf8..c79f3cc28bf 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -21,7 +21,7 @@ struct OperatorRegistry { std::mutex lock; OperatorMap operators; // list of operators whose schema have not yet been parsed, and must - // be registered before any call to lookup an opeator + // be registered before any call to lookup an operator std::vector> to_register; // Those two maps are used to implement lookupByLiteral, which is needed for // the n->match(...) calls. Basically, every function schema is assigned a diff --git a/torch/csrc/jit/passes/alias_analysis.cpp b/torch/csrc/jit/passes/alias_analysis.cpp index 443f19f6e45..4088e707c86 100644 --- a/torch/csrc/jit/passes/alias_analysis.cpp +++ b/torch/csrc/jit/passes/alias_analysis.cpp @@ -661,7 +661,7 @@ void AliasDb::analyzeWait(Node* node) { void AliasDb::analyzeTupleConstruct(Node* node) { // Because we currently mark all Tuples as needing annotation - // (even those containing just prmitive types), an element needs to be created + // (even those containing just primitive types), an element needs to be created // for TupleConstruct. 
When that changes we can create an element // only if it contains elements which need annotation getOrCreateElement(node->output()); diff --git a/torch/csrc/jit/passes/bailout_graph.cpp b/torch/csrc/jit/passes/bailout_graph.cpp index 9110d456e1f..4931a25a1eb 100644 --- a/torch/csrc/jit/passes/bailout_graph.cpp +++ b/torch/csrc/jit/passes/bailout_graph.cpp @@ -290,7 +290,7 @@ struct BailOutInserter { const auto& live_inputs = liveness_sets_[*it]; // guarded inputs come first - // currently, there's always one guaded input + // currently, there's always one guarded input bailout_node->addInput(it->input()); for (auto li : live_inputs) { // Guarded inputs have already been added diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 8fda94cef77..fde8f565307 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -167,7 +167,7 @@ RegisterOperators mm_tree_reduction_reg({Operator( // TreeTokens will be used to label nodes of the graph, if the nodes will fit // our mm/add tree pattern. Basically we do dynamic programming on DAGs, where // when we reach node N with inputs A and B, then A and B have already been -// procesed, and we can try to unify their TreeTokens (if they have them) +// processed, and we can try to unify their TreeTokens (if they have them) // and build a larger tree. struct TreeToken { uint64_t tree_size = 0; // NOTE: measured in number of leaves i.e. mm ops diff --git a/torch/csrc/jit/passes/guard_elimination.cpp b/torch/csrc/jit/passes/guard_elimination.cpp index 783da320ff2..9d3747ddad2 100644 --- a/torch/csrc/jit/passes/guard_elimination.cpp +++ b/torch/csrc/jit/passes/guard_elimination.cpp @@ -158,7 +158,7 @@ struct GuardElimination { } // `checkInputs` check the invariants specified in `removableGuard` - // on inputs to `n`. The invarints must hold, or an input must + // on inputs to `n`. 
The invariants must hold, or an input must // be a `prim::Constant` or be of `NumberType` or be included // as an exception in `except` bool checkInputs(Node *n, const std::unordered_set &except) { diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index 4198d770d8e..35855596ddb 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -345,7 +345,7 @@ void BlockToONNX( // Finally, visit all nodes in the graph for (auto node : old_block->nodes()) { if (node->kind().is_caffe2()) { - // Pass on Caffe2 opeartor, since we already preprocess it + // Pass on Caffe2 operator, since we already preprocess it cloneNode(node); } else if (node->kind() == prim::PythonOp) { callPySymbolicMethod(static_cast(node)); diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index ec8b8ffa0d2..e1b05fd824b 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -540,7 +540,7 @@ static void eraseListConstruct(Block* block, int opset_version) { TypePtr elem = lc_node->output()->type()->cast()->getElementType(); if (elem->cast()) { - // ListConstruct Int[] output case, we need to transfrom to ONNX + // ListConstruct Int[] output case, we need to transform to ONNX // Concat to ensure the output is a single tensor(dynamic) type in // order to be consumed as inputs std::vector unsqueezed; diff --git a/torch/csrc/jit/passes/quantization.cpp b/torch/csrc/jit/passes/quantization.cpp index 278da368f23..bf20313232a 100644 --- a/torch/csrc/jit/passes/quantization.cpp +++ b/torch/csrc/jit/passes/quantization.cpp @@ -700,7 +700,7 @@ void InsertQuantDeQuantHelper::removeObservers( // Remove observer modules from last one to first one in order to // reduce the time complexity, assuming all the observer modules // are added after the existing modules, we'll have complexity of - // O(N) where N is number of observer moduels with this optimization + // O(N) where N is number of observer modules with this optimization if (observer_modules_to_remove_.count(g)) { const auto& observers = observer_modules_to_remove_.at(g); for (int64_t i = observers.size() - 1; i >= 0; --i) { diff --git a/torch/csrc/jit/passes/quantization.h b/torch/csrc/jit/passes/quantization.h index db63e3aff49..91c9336481c 100644 --- a/torch/csrc/jit/passes/quantization.h +++ b/torch/csrc/jit/passes/quantization.h @@ -139,7 +139,7 @@ TORCH_API void FoldPrepackedWeightIntoModule( const script::Module& linear_params_module, const script::Module& conv_params_module); -/** Recursivly deduplicate multiple uses of the same module by +/** Recursively deduplicate multiple uses of the same module by * creating an instance clone for each use of the module, which means * the type will be the same as before and all the attributes will be * copied, then we'll change the use of the original module to the use diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index e739c884649..4087837dd2f 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -727,7 +727,7 @@ class ShapePropagator { // a tensor, so we should special-case these ops in the shape propagation. // Additionally, passing in a zero representative tensor into an integer // division op causes divide-by-zero errors - // _Outputs_ must be tensors or primtives + // _Outputs_ must be tensors or primitives // We will call inferTypeFrom on the tensors, and ignore the primitives. 
// However, we allow primitive returns because we want to support mixed // primitive/tensor outputs. diff --git a/torch/csrc/jit/script/concrete_module_type.h b/torch/csrc/jit/script/concrete_module_type.h index 70769704c31..aa98ddff651 100644 --- a/torch/csrc/jit/script/concrete_module_type.h +++ b/torch/csrc/jit/script/concrete_module_type.h @@ -159,7 +159,7 @@ class VISIBILITY_HIDDEN ConcreteModuleTypeBuilder { // not first-class in the type system. std::unordered_map functionAttributes_; // Function attributes that are calls to builtin functions. These get de-sugared - // directly into the correpsonding aten:: call. + // directly into the corresponding aten:: call. // The map is attribute name -> aten symbol name std::unordered_map builtinFunctions_; // The concrete types of any submodules diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index f1b2b5e5008..e21dca69e78 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -882,7 +882,7 @@ void initJitScriptBindings(PyObject* module) { const ExtraFilesMap& _extra_files = ExtraFilesMap()) { Module module("__torch__.PlaceholderModule"); // [issue 27343] - // Modules have 'training' attributes by defualt, but due to + // Modules have 'training' attributes by default, but due to // https://github.com/pytorch/pytorch/issues/27343, functions end // up having a training attribute when they are loaded. This adds // a fake 'training' attribute that shouldn't be used, but prevents diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index ea428c76fe7..5bcb7bee7e1 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -510,7 +510,7 @@ struct Lexer { std::shared_ptr source; size_t pos; size_t nesting; // depth of ( [ { nesting... - std::vector indent_stack; // stack of identation level of blocks + std::vector indent_stack; // stack of indentation level of blocks // Invariant: this should always contain at least a single element std::vector next_tokens; SharedParserData& shared; diff --git a/torch/csrc/jit/script/python_sugared_value.h b/torch/csrc/jit/script/python_sugared_value.h index dc5d2a0920b..de889411edc 100644 --- a/torch/csrc/jit/script/python_sugared_value.h +++ b/torch/csrc/jit/script/python_sugared_value.h @@ -105,7 +105,7 @@ struct VISIBILITY_HIDDEN ConstantParameterList : public SugaredValue { // defines how modules/methods behave inside the script subset. // for now this does not have any interaction with python. // in the future, we will add the ability to resolve `self.foo` to python -// {functions, modules, contants} so this SugaredValue is defined here +// {functions, modules, constants} so this SugaredValue is defined here // anticipating we will eventually need to replace Module with a py::object // holding the actual nn.Module class. diff --git a/torch/csrc/jit/script/schema_matching.cpp b/torch/csrc/jit/script/schema_matching.cpp index 154b483929a..0d1065fa53c 100644 --- a/torch/csrc/jit/script/schema_matching.cpp +++ b/torch/csrc/jit/script/schema_matching.cpp @@ -49,7 +49,7 @@ inline bool convertibleToList(const TypePtr& type, const TypePtr& list_type_) { return false; } -// Applies implict conversion from value trying to turn it into type +// Applies implicit conversion from value trying to turn it into type // concrete_type. 
It succeeds if `return_value->isSubtypeOf(concrete_type)` Value* tryConvertToType( const SourceRange& loc, diff --git a/torch/csrc/jit/script/schema_matching.h b/torch/csrc/jit/script/schema_matching.h index 3eeff758a43..0adac966fe0 100644 --- a/torch/csrc/jit/script/schema_matching.h +++ b/torch/csrc/jit/script/schema_matching.h @@ -54,7 +54,7 @@ TORCH_API c10::optional findInputWithName( const std::string& name, at::ArrayRef kwargs); -// applies implict conversion from value trying to turn it into type +// applies implicit conversion from value trying to turn it into type // concrete_type it succeeds if the return_value->isSubtypeOf(concrete_type) TORCH_API Value* tryConvertToType( const SourceRange& loc, diff --git a/torch/csrc/jit/script/string_to_type.cpp b/torch/csrc/jit/script/string_to_type.cpp index f7d95844c07..14f66054ef8 100644 --- a/torch/csrc/jit/script/string_to_type.cpp +++ b/torch/csrc/jit/script/string_to_type.cpp @@ -13,7 +13,7 @@ const std::unordered_map& string_to_type_lut() { {"str", StringType::get()}, {"Device", DeviceObjType::get()}, // technically this is not a python type but we need it when - // parsing serialized methods that use implicit converions to Scalar + // parsing serialized methods that use implicit conversions to Scalar {"number", NumberType::get()}, {"None", NoneType::get()}, {"Any", AnyType::get()}, diff --git a/torch/csrc/jit/unpickler.h b/torch/csrc/jit/unpickler.h index 9f81b086d6c..fb20dc86018 100644 --- a/torch/csrc/jit/unpickler.h +++ b/torch/csrc/jit/unpickler.h @@ -52,7 +52,7 @@ class Unpickler { IValue parse_ivalue(); private: - // No arguments ensures that a template arugment must be specified + // No arguments ensures that a template argument must be specified // so that the number of bytes read / type read is explicit template T read() { diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index 3adb47f9f47..6a0188d95ce 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -84,7 +84,7 @@ static PyObject* Tensor_instancecheck(PyTensorType* self, PyObject* arg) { // against torch.cuda.FloatTensor, this will immediately initialize CUDA. // I originally thought that it would not be possible for aten_type_ to // be nullptr if you had a tensor of some type, in which case you can - // skip initializign aten_type(), but TestAutograd.test_type_conversions + // skip initializing aten_type(), but TestAutograd.test_type_conversions // seems to violate this property (for whatever reason.) // // TODO: Stop using legacyExtractDispatchKey here (probably need to build diff --git a/torch/csrc/utils/throughput_benchmark-inl.h b/torch/csrc/utils/throughput_benchmark-inl.h index e14c0410709..4b092123431 100644 --- a/torch/csrc/utils/throughput_benchmark-inl.h +++ b/torch/csrc/utils/throughput_benchmark-inl.h @@ -28,7 +28,7 @@ BenchmarkExecutionStats BenchmarkHelper::benchmark( std::mt19937 engine(seeder()); TORCH_CHECK( !inputs_.empty(), - "Please provide benchmark inptus." + "Please provide benchmark inputs." "Did you forget to call add_input()? "); std::uniform_int_distribution dist(0, inputs_.size() - 1); diff --git a/torch/csrc/utils/variadic.h b/torch/csrc/utils/variadic.h index 63f34afbc37..b02b7d29f7f 100644 --- a/torch/csrc/utils/variadic.h +++ b/torch/csrc/utils/variadic.h @@ -52,7 +52,7 @@ template struct Indices {}; // Decrements the index N, adds N-1 to the list of indices and forwards -// whatever we arleady have. +// whatever we already have. 
template struct MakeIndices : MakeIndices {}; diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index bf437263195..90eae50e775 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -185,7 +185,7 @@ def _get_global_rank(group, group_rank): def _check_default_pg(): """ - Helper that checks if the default ProcessGroup has been initializd, with + Helper that checks if the default ProcessGroup has been initialized, with assertion """ diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py index eee8b76c2e0..9563fa629f8 100644 --- a/torch/distributed/launch.py +++ b/torch/distributed/launch.py @@ -63,7 +63,7 @@ Node 2: **Important Notices:** -1. This utilty and multi-process distributed (single-node or +1. This utility and multi-process distributed (single-node or multi-node) GPU training currently only achieves the best performance using the NCCL distributed backend. Thus NCCL backend is the recommended backend to use for GPU training. @@ -152,7 +152,7 @@ def parse_args(): @retval ArgumentParser """ parser = ArgumentParser(description="PyTorch distributed training launch " - "helper utilty that will spawn up " + "helper utility that will spawn up " "multiple distributed processes") # Optional arguments for the launch helper diff --git a/torch/distributed/rpc/__init__.py b/torch/distributed/rpc/__init__.py index 6f7455e1b0a..6f5a093d464 100644 --- a/torch/distributed/rpc/__init__.py +++ b/torch/distributed/rpc/__init__.py @@ -49,7 +49,7 @@ if is_available(): rank (int): a globally unique id/rank of this node. world_size (int): The number of workers in the group. rpc_backend_options (RpcBackendOptions): The options passed to - RpcAgent consturctor. It contains RpcAgent specific + RpcAgent constructor. It contains RpcAgent specific initialization configurations. By default, it contains ``rpc_timeout = timedelta(seconds=60)``, ``init_method = "env://"``, ``num_send_recv_threads = 4`` for diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py index 78e7e504566..20bc96a1121 100644 --- a/torch/distributions/transforms.py +++ b/torch/distributions/transforms.py @@ -32,7 +32,7 @@ class Transform(object): det jacobians. They are primarily used in :class:`torch.distributions.TransformedDistribution`. - Caching is useful for tranforms whose inverses are either expensive or + Caching is useful for transforms whose inverses are either expensive or numerically unstable. Note that care must be taken with memoized values since the autograd graph may be reversed. For example while the following works with or without caching:: diff --git a/torch/hub.py b/torch/hub.py index 73d97b1a7a9..0c38769c246 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -399,7 +399,7 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True): # We deliberately save it in a temp file and move it after # download is complete. This prevents a local working checkpoint - # being overriden by a broken download. + # being overridden by a broken download. 
dst = os.path.expanduser(dst) dst_dir = os.path.dirname(dst) f = tempfile.NamedTemporaryFile(delete=False, dir=dst_dir) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index e881a945cf0..14840394e34 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -27,7 +27,7 @@ constexpr const char* NCCL_BLOCKING_WAIT = "NCCL_BLOCKING_WAIT"; // specifically, each NCCL call is scheduled on a separate CUDA stream that is // different from the current CUDA stream. This is for the purpose of // achieving potentially concurrency and better performance. As a result, -// it is the callers' responsibilty to make sure that the CUDA stream their +// it is the callers' responsibility to make sure that the CUDA stream their // code works on needs to wait for the NCCL operation from // this class. // diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py index 194b37660e6..87d4dbedb26 100644 --- a/torch/multiprocessing/reductions.py +++ b/torch/multiprocessing/reductions.py @@ -175,7 +175,7 @@ def reduce_tensor(tensor): # the old ones alives. # See [https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html] # - # This is fine, because all we need to do is to save our position in the allocaiton, + # This is fine, because all we need to do is to save our position in the allocation, # and reconstruct storage and tensor from it. # 0xA000 -> -------CUDA Allocation------ # | | diff --git a/torch/nn/modules/_functions.py b/torch/nn/modules/_functions.py index f0f23953c00..3034e8e8969 100644 --- a/torch/nn/modules/_functions.py +++ b/torch/nn/modules/_functions.py @@ -34,7 +34,7 @@ class SyncBatchNorm(Function): mean_all_reduce.wait() invstd_all_reduce.wait() - # calcualte global mean & invstd + # calculate global mean & invstd mean, invstd = torch.batch_norm_gather_stats_with_counts( input, mean_all, diff --git a/torch/nn/modules/adaptive.py b/torch/nn/modules/adaptive.py index cd1e9e9a7c9..b4087a5f27b 100644 --- a/torch/nn/modules/adaptive.py +++ b/torch/nn/modules/adaptive.py @@ -59,7 +59,7 @@ class AdaptiveLogSoftmaxWithLoss(Module): implementation. .. warning:: - Labels passed as inputs to this module should be sorted accoridng to + Labels passed as inputs to this module should be sorted according to their frequency. This means that the most frequent label should be represented by the index `0`, and the least frequent label should be represented by the index `n_classes - 1`. diff --git a/torch/nn/modules/instancenorm.py b/torch/nn/modules/instancenorm.py index 79fc6a3de4b..8037d246927 100644 --- a/torch/nn/modules/instancenorm.py +++ b/torch/nn/modules/instancenorm.py @@ -83,7 +83,7 @@ class InstanceNorm1d(_InstanceNorm): have some subtle differences. :class:`InstanceNorm1d` is applied on each channel of channeled data like multidimensional time series, but :class:`LayerNorm` is usually applied on entire sample and often in NLP - tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + tasks. Additionally, :class:`LayerNorm` applies elementwise affine transform, while :class:`InstanceNorm1d` usually don't apply affine transform. @@ -164,7 +164,7 @@ class InstanceNorm2d(_InstanceNorm): have some subtle differences. :class:`InstanceNorm2d` is applied on each channel of channeled data like RGB images, but :class:`LayerNorm` is usually applied on entire sample and often in NLP - tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + tasks. 
Additionally, :class:`LayerNorm` applies elementwise affine transform, while :class:`InstanceNorm2d` usually don't apply affine transform. @@ -238,7 +238,7 @@ class InstanceNorm3d(_InstanceNorm): have some subtle differences. :class:`InstanceNorm3d` is applied on each channel of channeled data like 3D models with RGB color, but :class:`LayerNorm` is usually applied on entire sample and often in NLP - tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + tasks. Additionally, :class:`LayerNorm` applies elementwise affine transform, while :class:`InstanceNorm3d` usually don't apply affine transform. diff --git a/torch/nn/utils/prune.py b/torch/nn/utils/prune.py index 83ac0a6d065..2eff648c028 100644 --- a/torch/nn/utils/prune.py +++ b/torch/nn/utils/prune.py @@ -169,7 +169,7 @@ class BasePruningMethod(ABC): del module._parameters[name] default_mask = torch.ones_like(orig) # temp # If this is not the first time pruning is applied, all of the above - # has been done before in a previos pruning iteration, so we're good + # has been done before in a previous pruning iteration, so we're good # to go else: default_mask = getattr(module, name + "_mask").detach().clone(memory_format=torch.contiguous_format) diff --git a/torch/onnx/operators.py b/torch/onnx/operators.py index fcf84a73b2d..88c40e270e1 100644 --- a/torch/onnx/operators.py +++ b/torch/onnx/operators.py @@ -1,6 +1,6 @@ r"""This file provides a location for operators that help exporting models via onnx. E.g. shape_as_tensor and reshape_from_tensor_shape -are to make all dynamic sizes operations traceble. +are to make all dynamic sizes operations traceable. NOTE: at one point these functions were implemented differently. Since then we have implemented these directly in ATen, so this diff --git a/torch/utils/checkpoint.py b/torch/utils/checkpoint.py index bfedab7eaaf..7cb7414ce61 100644 --- a/torch/utils/checkpoint.py +++ b/torch/utils/checkpoint.py @@ -115,7 +115,7 @@ def checkpoint(function, *args, **kwargs): :func:`torch.no_grad` manner, i.e., not storing the intermediate activations. Instead, the forward pass saves the inputs tuple and the :attr:`function` parameter. In the backwards pass, the saved inputs and - :attr:`function` is retreived, and the forward pass is computed on + :attr:`function` is retrieved, and the forward pass is computed on :attr:`function` again, now tracking the intermediate activations, and then the gradients are calculated using these activation values. diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index b91b16e20c9..aed88c402c9 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -537,7 +537,7 @@ def include_paths(cuda=False): if cuda: cuda_home_include = _join_cuda_home('include') # if we have the Debian/Ubuntu packages for cuda, we get /usr as cuda home. - # but gcc dosn't like having /usr/include passed explicitly + # but gcc doesn't like having /usr/include passed explicitly if cuda_home_include != '/usr/include': paths.append(cuda_home_include) if CUDNN_HOME is not None: diff --git a/torch/utils/data/_utils/worker.py b/torch/utils/data/_utils/worker.py index 4b6568f081a..bd994d5e7a0 100644 --- a/torch/utils/data/_utils/worker.py +++ b/torch/utils/data/_utils/worker.py @@ -111,7 +111,7 @@ def _worker_loop(dataset_kind, dataset, index_queue, data_queue, done_event, # logic of this function. try: - # Intialize C side signal handlers for SIGBUS and SIGSEGV. Python signal + # Initialize C side signal handlers for SIGBUS and SIGSEGV. 
Python signal # module's handlers are executed after Python returns from C low-level # handlers, likely when the same fatal signal had already happened # again. diff --git a/torch/utils/file_baton.py b/torch/utils/file_baton.py index c4eea9fb382..143d2935aa8 100644 --- a/torch/utils/file_baton.py +++ b/torch/utils/file_baton.py @@ -49,6 +49,6 @@ class FileBaton: time.sleep(self.wait_seconds) def release(self): - '''Releaes the baton and removes its file.''' + '''Releases the baton and removes its file.''' os.close(self.fd) os.remove(self.lock_file_path)