Updates to the release notes scripts and documentation (#94560)

# Summary
This PR makes some significant changes to the release notes scripts. At a high level:
- Turned the quips into docs and updated links
- Updated the common.categories list in the hope of making it the source of truth for releases. This is hard since the release_notes labels can be changed at will. An alternative would be to poll the GitHub API; however, I think that is overkill. The notebook does a set comparison and will show you new categories. I think we want this to be manual so that the release notes engineer decides how to categorize.
- Created category groups after speaking with folks on distributed and AO, who told me these different release categories can be merged.
- I am the newest person on Core and don't use ghstack, so I made token retrieval a little more generic.
- Added a classifier.py file. This file will train a commit categorizer for you, hopefully with decent accuracy. I was able to achieve 75% accuracy. I drop the highest-frequency class, "skip", since this creates a more useful categorizer.
- I updated the categorize.py script so that the prompt will be what the classifier thinks, gated by a flag.
- Added a readme that will hopefully help future release notes engineers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94560
Approved by: https://github.com/albanD
This commit is contained in:
Driss Guessous
2023-03-16 00:09:26 +00:00
committed by PyTorch MergeBot
parent 731bb6e61b
commit dcafe3f271
7 changed files with 885 additions and 42 deletions

View File

@@ -1,10 +1,11 @@
import argparse
from common import run, topics, get_features
from common import run, topics, get_features, frontend_categories
from collections import defaultdict
import os
from pathlib import Path
import csv
import pprint
import common
from common import get_commit_data_cache, features_to_dict
import re
import dataclasses
@@ -30,6 +31,7 @@ class Commit:
category: str
topic: str
title: str
files_changed: str
pr_link: str
author: str
@@ -86,7 +88,7 @@ class CommitList:
writer.writerow(commit_fields)
for commit in commit_list:
writer.writerow(dataclasses.astuple(commit))
@staticmethod
def keywordInFile(file, keywords):
for key in keywords:
if key in file:
@@ -103,8 +105,56 @@ class CommitList:
pr_link = f"https://github.com/pytorch/pytorch/pull/{features['pr_number']}"
else:
pr_link = None
files_changed_str = ' '.join(features['files_changed'])
return Commit(commit_hash, category, topic, features["title"], files_changed_str, pr_link, features["author"], a1, a2, a3)
return Commit(commit_hash, category, topic, features["title"], pr_link, features["author"], a1, a2, a3)
@staticmethod
def category_remapper(category: str) -> str:
    """Map a release-notes label category onto the category used for grouping.

    Rules are tried in order and the first one that applies wins:

    * a category listed in ``frontend_categories`` gets ``'_frontend'`` appended;
    * ``'Meta API'`` becomes ``'composability'``;
    * a member of ``common.quantization.categories`` becomes
      ``common.quantization.name``;
    * a member of ``common.distributed.categories`` becomes
      ``common.distributed.name``.

    Args:
        category (str): category text taken from a ``release notes: `` label.

    Returns:
        str: the remapped category, or ``category`` unchanged when no rule applies.
    """
    if category in frontend_categories:
        return category + '_frontend'
    if category == 'Meta API':
        return 'composability'

    # Several fine-grained labels are merged into one release-notes group.
    quantization_group = common.quantization
    if category in quantization_group.categories:
        return quantization_group.name

    distributed_group = common.distributed
    if category in distributed_group.categories:
        return distributed_group.name

    return category
@staticmethod
def bracket_category_matcher(title: str):
    """Return the category implied by a bracketed tag in a commit title.

    The title is lower-cased before matching, so tags match regardless of
    case. Tags are tried in the order they are listed below, and the first
    tag found anywhere in the title decides the result.

    Args:
        title (str): commit title to search.

    Returns:
        Optional[str]: the category for the first listed tag present in the
        title, or None when none of the tags is present.
    """
    # Insertion order is the match priority.
    tag_to_category = {
        '[dynamo]': 'dynamo',
        '[torchdynamo]': 'dynamo',
        '[torchinductor]': 'inductor',
        '[inductor]': 'inductor',
        # No closing bracket: any tag that merely starts with "[codemod" matches.
        '[codemod': 'skip',
        '[profiler]': 'profiler',
        '[functorch]': 'functorch',
        '[autograd]': 'autograd_frontend',
        '[quantization]': 'quantization',
        '[nn]': 'nn_frontend',
        '[complex]': 'complex_frontend',
        '[mps]': 'mps',
        '[optimizer]': 'optimizer_frontend',
        '[xla]': 'xla',
    }
    lowered = title.lower()
    return next(
        (category for tag, category in tag_to_category.items() if tag in lowered),
        None,
    )
@staticmethod
def categorize(features):
@@ -113,6 +163,10 @@ class CommitList:
category = 'Uncategorized'
topic = 'Untopiced'
# Revert commits are merged directly to master with no associated PR number
if features['pr_number'] is None:
if title.startswith("Revert"):
return 'skip', topic
# We ask contributors to label their PR's appropriately
# when they're first landed.
@@ -121,6 +175,7 @@ class CommitList:
for label in labels:
if label.startswith('release notes: '):
category = label.split('release notes: ', 1)[1]
category = CommitList.category_remapper(category)
already_categorized = True
if label.startswith('topic: '):
topic = label.split('topic: ', 1)[1]
@@ -131,8 +186,6 @@ class CommitList:
# update this to check if each file starts with caffe2
if 'caffe2' in title:
return 'caffe2', topic
if '[codemod]' in title.lower():
return 'skip', topic
if 'Reverted' in labels:
return 'skip', topic
if 'bc_breaking' in labels:
@@ -140,6 +193,10 @@ class CommitList:
if 'module: deprecation' in labels:
topic = 'deprecation'
found_bracket_category = CommitList.bracket_category_matcher(title)
if found_bracket_category:
return found_bracket_category, topic
files_changed = features['files_changed']
for file in files_changed:
file_lowercase = file.lower()
@@ -169,11 +226,11 @@ class CommitList:
category = 'fx'
break
if CommitList.keywordInFile(file, ['torch/ao', 'test/ao']):
category = 'ao'
category = common.quantization.name
break
# torch/quantization, test/quantization, aten/src/ATen/native/quantized, torch/nn/{quantized, quantizable}
if CommitList.keywordInFile(file, ['torch/quantization', 'test/quantization', 'aten/src/ATen/native/quantized', 'torch/nn/quantiz']):
category = 'quantization'
category = common.quantization.name
break
if CommitList.keywordInFile(file, ['torch/package', 'test/package']):
category = 'package'
@@ -196,6 +253,15 @@ class CommitList:
if CommitList.keywordInFile(file, ['torch/csrc/jit', 'torch/jit']):
category = 'jit'
break
if CommitList.keywordInFile(file, ['torch/_meta_registrations.py', 'torch/_decomp', 'torch/_prims', 'torch/_refs']):
category = 'composability'
break
if CommitList.keywordInFile(file, ['torch/_dynamo']):
category = 'dynamo'
break
if CommitList.keywordInFile(file, ['torch/_inductor']):
category = 'inductor'
break
else:
# Below are some extra quick checks that aren't necessarily file-path related,
# but I found that to catch a decent number of extra commits.
@@ -210,6 +276,9 @@ class CommitList:
# individual torch_docs changes are usually for python ops
category = 'python_frontend'
# If we couldn't find a category but the topic is not user facing we can skip these:
if category == "Uncategorized" and topic == "not user facing":
category = "skip"
return category, topic
@@ -260,13 +329,13 @@ def update_existing(path, new_version):
def rerun_with_new_filters(path):
current_commits = CommitList.from_existing(path)
for i in range(len(current_commits.commits)):
c = current_commits.commits[i]
if 'Uncategorized' in str(c):
feature_item = get_commit_data_cache().get(c.commit_hash)
for i, commit in enumerate(current_commits.commits):
current_category = commit.category
if current_category == 'Uncategorized' or current_category not in common.categories:
feature_item = get_commit_data_cache().get(commit.commit_hash)
features = features_to_dict(feature_item)
category, topic = CommitList.categorize(features)
current_commits[i] = dataclasses.replace(c, category=category, topic=topic)
current_commits.commits[i] = dataclasses.replace(commit, category=category, topic=topic)
current_commits.write_result()
def get_hash_or_pr_url(commit: Commit):
@@ -318,14 +387,14 @@ def get_markdown_header(category):
The main goal of this process is to rephrase all the commit messages below to make them clear and easy to read by the end user. You should follow the following instructions to do so:
* **Please cleanup, and format commit titles to be readable by the general pytorch user.** [Detailed intructions here](https://fb.quip.com/OCRoAbEvrRD9#HdaACARZZvo)
* **Please cleanup, and format commit titles to be readable by the general pytorch user.** [Detailed instructions here](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit)
* Please sort commits into the following categories (you should not rename the categories!), I tried to pre-sort these to ease your work, feel free to move commits around if the current categorization is not good.
* Please drop any commits that are not user-facing.
* If anything is from another domain, leave it in the UNTOPICED section at the end and I'll come and take care of it.
The categories below are as follows:
* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rational behind the change as well as an example for how to update user code (guidelines here: https://quip.com/OCRoAbEvrRD9)
* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rational behind the change as well as an example for how to update user code [BC-Guidelines](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit#heading=h.a9htwgvvec1m).
* Deprecations: All commits introducing deprecation. Each commit should include a small example explaining what should be done to update user code.
* new_features: All commits introducing a new feature (new functions, new submodule, new supported platform etc)
* improvements: All commits providing improvements to existing feature should be here (new backend for a function, new argument, better numerical stability)
@@ -357,6 +426,7 @@ def main():
if args.create_new:
create_new(args.path, args.create_new[0], args.create_new[1])
print("Finished creating new commit list. Results have been saved to results/commitlist.csv")
return
if args.update_to:
update_existing(args.path, args.update_to)