From 374e4713d866b39cbb5474c7a38dd40b2e6f8b7d Mon Sep 17 00:00:00 2001 From: Coco Sheng Date: Wed, 6 May 2026 16:31:49 -0400 Subject: [PATCH] feat: add categorize_issues.py to automatically classify and update github issues, and update README --- scripts/backlog-analysis/README.md | 8 +- scripts/backlog-analysis/categorize_issues.py | 104 ++++++++++++++++++ 2 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 scripts/backlog-analysis/categorize_issues.py diff --git a/scripts/backlog-analysis/README.md b/scripts/backlog-analysis/README.md index a5104326ff..757e31db1e 100644 --- a/scripts/backlog-analysis/README.md +++ b/scripts/backlog-analysis/README.md @@ -22,11 +22,13 @@ If you have a raw list of uncategorized issues, the first step is to apply the correct types (`bug` or `feature`) directly on GitHub, and then fetch the data into a local JSON file for analysis. -**A) Auto-Categorize on GitHub:** Use the Gemini CLI directly in your terminal -to classify and label the issues on GitHub. +**A) Auto-Categorize on GitHub:** We provide a dedicated Python script that will +automatically fetch uncategorized issues matching your search query, classify +them using the Gemini API, and apply the correct labels and title prefixes +(`[Bug]` or `[Feature]`) directly on GitHub. ```bash -gemini "I have a list of issues (e.g. 123, 124). For each issue, determine if it is a bug or a feature request. Use the gh CLI tool to act on the GitHub issue: (a) Add the 'type/bug' or 'type/feature' label, and (b) Edit the issue body or title to explicitly denote the type." 
"""
Purpose: Automatically categorizes GitHub issues as 'bug' or 'feature' and applies the corresponding label on GitHub.
It fetches issues matching a search query, uses the Gemini API to classify them, and runs 'gh issue edit' to update GitHub.

Usage:
    python3 categorize_issues.py --api-key "YOUR_KEY" --search "repo:google-gemini/gemini-cli -label:type/bug -label:type/feature is:open" --limit 50

The API key may also be supplied through the GEMINI_API_KEY environment
variable, which keeps it out of the shell history and the process list.
"""
import argparse
import concurrent.futures
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from typing import Optional

MODEL = "gemini-3-flash-preview"

# Only these classifications are ever turned into a ``type/<x>`` label.
VALID_TYPES = ("bug", "feature")
# Issue bodies are truncated to keep the prompt small.
MAX_BODY_CHARS = 1500
# Upper bound for a single Gemini request so a stalled call cannot hang a worker.
REQUEST_TIMEOUT_SECONDS = 60
MAX_WORKERS = 4


def fetch_issues(search_query: str, limit: int) -> list:
    """Return the issues matching *search_query* as a list of dicts.

    Runs ``gh issue list --json number,title,body,url``. Any failure (``gh``
    missing, non-zero exit status, unparsable output) is reported on stderr
    and terminates the process with exit status 1.
    """
    cmd = [
        "gh", "issue", "list",
        "--search", search_query,
        "--limit", str(limit),
        "--json", "number,title,body,url",
    ]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return json.loads(res.stdout)
    except FileNotFoundError:
        print("Error fetching issues: the 'gh' CLI was not found on PATH.", file=sys.stderr)
    except subprocess.CalledProcessError as e:
        print(f"Error fetching issues: {e.stderr}", file=sys.stderr)
    except json.JSONDecodeError as e:
        print(f"Error fetching issues: could not parse 'gh' output: {e}", file=sys.stderr)
    sys.exit(1)


def build_prompt(issue: dict) -> str:
    """Build the classification prompt for one issue.

    A missing or ``None`` title/body is treated as an empty string; the body
    is truncated to ``MAX_BODY_CHARS`` characters.
    """
    title = issue.get("title") or ""
    body = (issue.get("body") or "")[:MAX_BODY_CHARS]
    return f"""
Analyze the following GitHub issue and determine if it is a bug or a feature request.
Reply ONLY with a valid JSON object matching exactly this schema, without Markdown formatting:
{{"type": "bug" | "feature", "reasoning": "brief justification"}}

Issue Title: {title}
Issue Body: {body}
"""


def strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from *text*."""
    cleaned = text.strip()
    # Check the longer opener first so "```json" is not left as a stray "json".
    for opener in ("```json", "```"):
        if cleaned.lower().startswith(opener):
            cleaned = cleaned[len(opener):]
            break
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def parse_classification(text) -> Optional[dict]:
    """Parse the model reply into a classification dict.

    Returns the decoded JSON object with ``"type"`` normalized to lower case,
    or ``None`` when the reply is not a string, is not a JSON object, or its
    type is not one of ``VALID_TYPES``.
    """
    if not isinstance(text, str):
        return None
    try:
        parsed = json.loads(strip_code_fence(text))
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None
    issue_type = str(parsed.get("type", "")).strip().lower()
    if issue_type not in VALID_TYPES:
        return None
    return {**parsed, "type": issue_type}


def build_edit_command(issue: dict, issue_type: str) -> list:
    """Build the ``gh issue edit`` argument list for *issue*.

    The issue is addressed by its URL when one is available so the edit is
    applied to the repository the issue belongs to; otherwise its number is
    used. A ``[Type]`` title prefix is added unless the title already starts
    with ``[type]`` or ``type:`` (case-insensitive) or is empty.
    """
    target = issue.get("url") or str(issue["number"])
    cmd = ["gh", "issue", "edit", target, "--add-label", f"type/{issue_type}"]

    title = issue.get("title") or ""
    already_tagged = title.lower().startswith((f"[{issue_type}]", f"{issue_type}:"))
    if title and not already_tagged:
        cmd.extend(["--title", f"[{issue_type.capitalize()}] {title}"])
    return cmd


def categorize_issue(issue: dict, api_key: str) -> Optional[dict]:
    """Ask the Gemini API whether *issue* is a bug or a feature request.

    Returns the parsed classification dict (``"type"`` is ``"bug"`` or
    ``"feature"``), or ``None`` after logging to stderr when the request
    fails or the reply cannot be interpreted.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent"
    payload = {
        "contents": [{"parts": [{"text": build_prompt(issue)}]}],
        "generationConfig": {"temperature": 0.1},
    }
    # The key travels in a header rather than the query string so it does not
    # end up in URLs that get logged or echoed in error messages.
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json", "x-goog-api-key": api_key},
    )
    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_SECONDS) as response:
            result = json.loads(response.read().decode("utf-8"))
        text = result["candidates"][0]["content"]["parts"][0]["text"]
    except (OSError, ValueError, KeyError, IndexError, TypeError) as e:
        # OSError covers URLError/HTTPError/timeouts; ValueError covers bad JSON.
        print(f"Error processing issue {issue['number']}: {e}", file=sys.stderr)
        return None

    parsed = parse_classification(text)
    if parsed is None:
        print(
            f"Error processing issue {issue['number']}: unexpected model reply: {text!r}",
            file=sys.stderr,
        )
    return parsed


def process_issue(issue: dict, api_key: str) -> bool:
    """Classify one issue and apply the label (and title prefix) on GitHub.

    Returns ``True`` when the issue was updated, ``False`` otherwise.
    """
    number = issue["number"]
    print(f"Categorizing Issue #{number}...")
    result = categorize_issue(issue, api_key)
    if result is None:
        print(f"Failed to categorize #{number}.", file=sys.stderr)
        return False

    issue_type = result["type"]
    label = f"type/{issue_type}"
    print(f"Issue #{number} is a {issue_type}. Applying label '{label}' on GitHub...")

    cmd = build_edit_command(issue, issue_type)
    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error labeling #{number}: {e.stderr}", file=sys.stderr)
        return False

    if "--title" in cmd:
        print(f"Successfully labeled and updated title for #{number}.")
    else:
        print(f"Successfully labeled #{number}.")
    return True


def main():
    """Parse arguments, fetch matching issues, and categorize them concurrently.

    Exits with status 1 when at least one issue could not be processed.
    """
    parser = argparse.ArgumentParser(description="Auto-categorize GitHub issues (bug vs feature) and apply labels on GitHub.")
    parser.add_argument(
        "--api-key",
        default=os.environ.get("GEMINI_API_KEY"),
        help="Gemini API Key (defaults to the GEMINI_API_KEY environment variable)",
    )
    parser.add_argument("--search", required=True, help="GitHub search query for issues to categorize")
    parser.add_argument("--limit", type=int, default=50, help="Maximum number of issues to process")
    args = parser.parse_args()
    if not args.api_key:
        parser.error("an API key is required: pass --api-key or set GEMINI_API_KEY")

    print(f"Fetching issues matching: '{args.search}'")
    issues = fetch_issues(args.search, args.limit)
    if not issues:
        print("No issues found matching the query.")
        return

    print(f"Found {len(issues)} issues to categorize.")

    failures = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(process_issue, issue, args.api_key): issue
            for issue in issues
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                succeeded = future.result()
            except Exception as e:
                # Top-level boundary: one broken issue must not hide the rest.
                print(f"Unexpected error on #{futures[future].get('number')}: {e}", file=sys.stderr)
                succeeded = False
            if not succeeded:
                failures += 1

    print("Done categorizing issues.")
    if failures:
        print(f"{failures} of {len(issues)} issues could not be processed.", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()