Updates to the release notes scripts and documentation (#94560)

# Summary This PR makes some significant changes to the release notes scripts. At a high level: - Turned the quips into docs and updated links. - Updated the common.categories list in the hope of making it the source of truth for releases. This is hard since the release_notes labels can be changed at will. An alternative would be to poll the GitHub API; however, I think that is overkill. The notebook does a set comparison and will show you new categories. I think we want this to be manual so that the release notes engineer decides how to categorize. - Created category groups after speaking with folks on distributed and AO, who told me these different release categories can be merged. - I am the newest person on Core and don't use ghstack, so I made token retrieval a little more generic. - Added a classifier.py file. This file will train a commit categorizer for you, hopefully with decent accuracy. I was able to achieve 75% accuracy. I drop the highest-frequency class, "skip", since this creates a more useful categorizer. - Updated the categorize.py script so that the prompt will be what the classifier thinks, gated by a flag. - Added a readme that will hopefully help future release notes engineers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/94560 Approved by: https://github.com/albanD
2026-01-15 12:15:51 +00:00 · 2023-03-16 00:09:26 +00:00
parent 731bb6e61b
commit dcafe3f271
7 changed files with 885 additions and 42 deletions
--- a/scripts/release_notes/commitlist.py
+++ b/scripts/release_notes/commitlist.py
@@ -1,10 +1,11 @@
 import argparse
-from common import run, topics, get_features
+from common import run, topics, get_features, frontend_categories
 from collections import defaultdict
 import os
 from pathlib import Path
 import csv
 import pprint
+import common
 from common import get_commit_data_cache, features_to_dict
 import re
 import dataclasses
@@ -30,6 +31,7 @@ class Commit:
    category: str
    topic: str
    title: str
+    files_changed: str
    pr_link: str
    author: str

@@ -86,7 +88,7 @@ class CommitList:
            writer.writerow(commit_fields)
            for commit in commit_list:
                writer.writerow(dataclasses.astuple(commit))
-
+    @staticmethod
    def keywordInFile(file, keywords):
        for key in keywords:
            if key in file:
@@ -103,8 +105,56 @@ class CommitList:
            pr_link = f"https://github.com/pytorch/pytorch/pull/{features['pr_number']}"
        else:
            pr_link = None
+        files_changed_str = ' '.join(features['files_changed'])
+        return Commit(commit_hash, category, topic, features["title"], files_changed_str,  pr_link, features["author"], a1, a2, a3)

-        return Commit(commit_hash, category, topic, features["title"], pr_link, features["author"], a1, a2, a3)
+    @staticmethod
+    def category_remapper(category: str) -> str:
+        if category in frontend_categories:
+            category = category + '_frontend'
+            return category
+        if category == 'Meta API':
+            category = 'composability'
+            return category
+        if category in common.quantization.categories:
+            category = common.quantization.name
+            return category
+        if category in common.distributed.categories:
+            category = common.distributed.name
+            return category
+        return category
+
+    @staticmethod
+    def bracket_category_matcher(title: str):
+        """Categorize a commit based on the presence of a bracketed category in the title.
+
+        Args:
+            title (str): title to search
+
+        Returns:
+            optional[str]
+        """
+        pairs = [
+            ('[dynamo]', 'dynamo'),
+            ('[torchdynamo]', 'dynamo'),
+            ('[torchinductor]', 'inductor'),
+            ('[inductor]', 'inductor'),
+            ('[codemod', 'skip'),
+            ('[profiler]', 'profiler'),
+            ('[functorch]', 'functorch'),
+            ('[autograd]', 'autograd_frontend'),
+            ('[quantization]', 'quantization'),
+            ('[nn]', 'nn_frontend'),
+            ('[complex]', 'complex_frontend'),
+            ('[mps]', 'mps'),
+            ('[optimizer]', 'optimizer_frontend'),
+            ('[xla]', 'xla'),
+        ]
+        title_lower = title.lower()
+        for bracket, category in pairs:
+            if bracket in title_lower:
+                return category
+        return None

    @staticmethod
    def categorize(features):
@@ -113,6 +163,10 @@ class CommitList:
        category = 'Uncategorized'
        topic = 'Untopiced'

+        # Revert commits are merged directly to master with no associated PR number
+        if features['pr_number'] is None:
+            if title.startswith("Revert"):
+                return 'skip', topic

        # We ask contributors to label their PR's appropriately
        # when they're first landed.
@@ -121,6 +175,7 @@ class CommitList:
        for label in labels:
            if label.startswith('release notes: '):
                category = label.split('release notes: ', 1)[1]
+                category = CommitList.category_remapper(category)
                already_categorized = True
            if label.startswith('topic: '):
                topic = label.split('topic: ', 1)[1]
@@ -131,8 +186,6 @@ class CommitList:
        # update this to check if each file starts with caffe2
        if 'caffe2' in title:
            return 'caffe2', topic
-        if '[codemod]' in title.lower():
-            return 'skip', topic
        if 'Reverted' in labels:
            return 'skip', topic
        if 'bc_breaking' in labels:
@@ -140,6 +193,10 @@ class CommitList:
        if 'module: deprecation' in labels:
            topic = 'deprecation'

+        found_bracket_category = CommitList.bracket_category_matcher(title)
+        if found_bracket_category:
+            return found_bracket_category, topic
+
        files_changed = features['files_changed']
        for file in files_changed:
            file_lowercase = file.lower()
@@ -169,11 +226,11 @@ class CommitList:
                category = 'fx'
                break
            if CommitList.keywordInFile(file, ['torch/ao', 'test/ao']):
-                category = 'ao'
+                category = common.quantization.name
                break
            # torch/quantization, test/quantization, aten/src/ATen/native/quantized, torch/nn/{quantized, quantizable}
            if CommitList.keywordInFile(file, ['torch/quantization', 'test/quantization', 'aten/src/ATen/native/quantized', 'torch/nn/quantiz']):
-                category = 'quantization'
+                category = common.quantization.name
                break
            if CommitList.keywordInFile(file, ['torch/package', 'test/package']):
                category = 'package'
@@ -196,6 +253,15 @@ class CommitList:
            if CommitList.keywordInFile(file, ['torch/csrc/jit', 'torch/jit']):
                category = 'jit'
                break
+            if CommitList.keywordInFile(file, ['torch/_meta_registrations.py', 'torch/_decomp', 'torch/_prims', 'torch/_refs']):
+                category = 'composability'
+                break
+            if CommitList.keywordInFile(file, ['torch/_dynamo']):
+                category = 'dynamo'
+                break
+            if CommitList.keywordInFile(file, ['torch/_inductor']):
+                category = 'inductor'
+                break
        else:
            # Below are some extra quick checks that aren't necessarily file-path related,
            # but I found that to catch a decent number of extra commits.
@@ -210,6 +276,9 @@ class CommitList:
                # individual torch_docs changes are usually for python ops
                category = 'python_frontend'

+        # If we couldn't find a category but the topic is not user facing we can skip these:
+        if category == "Uncategorized" and topic == "not user facing":
+            category = "skip"

        return category, topic

@@ -260,13 +329,13 @@ def update_existing(path, new_version):

 def rerun_with_new_filters(path):
    current_commits = CommitList.from_existing(path)
-    for i in range(len(current_commits.commits)):
-        c = current_commits.commits[i]
-        if 'Uncategorized' in str(c):
-            feature_item = get_commit_data_cache().get(c.commit_hash)
+    for i, commit in enumerate(current_commits.commits):
+        current_category = commit.category
+        if current_category == 'Uncategorized' or current_category not in common.categories:
+            feature_item = get_commit_data_cache().get(commit.commit_hash)
            features = features_to_dict(feature_item)
            category, topic = CommitList.categorize(features)
-            current_commits[i] = dataclasses.replace(c, category=category, topic=topic)
+            current_commits.commits[i] = dataclasses.replace(commit, category=category, topic=topic)
    current_commits.write_result()

 def get_hash_or_pr_url(commit: Commit):
@@ -318,14 +387,14 @@ def get_markdown_header(category):

 The main goal of this process is to rephrase all the commit messages below to make them clear and easy to read by the end user. You should follow the following instructions to do so:

-* **Please cleanup, and format commit titles to be readable by the general pytorch user.** [Detailed intructions here](https://fb.quip.com/OCRoAbEvrRD9#HdaACARZZvo)
+* **Please cleanup, and format commit titles to be readable by the general pytorch user.** [Detailed instructions here](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit)
 * Please sort commits into the following categories (you should not rename the categories!), I tried to pre-sort these to ease your work, feel free to move commits around if the current categorization is not good.
 * Please drop any commits that are not user-facing.
 * If anything is from another domain, leave it in the UNTOPICED section at the end and I'll come and take care of it.

 The categories below are as follows:

-* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rational behind the change as well as an example for how to update user code (guidelines here: https://quip.com/OCRoAbEvrRD9)
+* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rational behind the change as well as an example for how to update user code [BC-Guidelines](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit#heading=h.a9htwgvvec1m).
 * Deprecations: All commits introducing deprecation. Each commit should include a small example explaining what should be done to update user code.
 * new_features: All commits introducing a new feature (new functions, new submodule, new supported platform etc)
 * improvements: All commits providing improvements to existing feature should be here (new backend for a function, new argument, better numerical stability)
@@ -357,6 +426,7 @@ def main():

    if args.create_new:
        create_new(args.path, args.create_new[0], args.create_new[1])
+        print("Finished creating new commit list. Results have been saved to results/commitlist.csv")
        return
    if args.update_to:
        update_existing(args.path, args.update_to)