From 374e4713d866b39cbb5474c7a38dd40b2e6f8b7d Mon Sep 17 00:00:00 2001
From: Coco Sheng <cocosheng@google.com>
Date: Wed, 6 May 2026 16:31:49 -0400
Subject: [PATCH] feat: add categorize_issues.py to automatically classify and
 update github issues, and update README

---
 scripts/backlog-analysis/README.md            |   8 +-
 scripts/backlog-analysis/categorize_issues.py | 104 ++++++++++++++++++
 2 files changed, 109 insertions(+), 3 deletions(-)
 create mode 100644 scripts/backlog-analysis/categorize_issues.py

diff --git a/scripts/backlog-analysis/README.md b/scripts/backlog-analysis/README.md
index a5104326ff..757e31db1e 100644
--- a/scripts/backlog-analysis/README.md
+++ b/scripts/backlog-analysis/README.md
@@ -22,11 +22,13 @@ If you have a raw list of uncategorized issues, the first step is to apply the
 correct types (`bug` or `feature`) directly on GitHub, and then fetch the data
 into a local JSON file for analysis.
 
-**A) Auto-Categorize on GitHub:** Use the Gemini CLI directly in your terminal
-to classify and label the issues on GitHub.
+**A) Auto-Categorize on GitHub:** We provide a dedicated Python script that will
+automatically fetch uncategorized issues matching your search query, classify
+them using the Gemini API, and apply the correct labels and title prefixes
+(`[Bug]` or `[Feature]`) directly on GitHub.
 
 ```bash
-gemini "I have a list of issues (e.g. 123, 124). For each issue, determine if it is a bug or a feature request. Use the gh CLI tool to act on the GitHub issue: (a) Add the 'type/bug' or 'type/feature' label, and (b) Edit the issue body or title to explicitly denote the type."
+python3 categorize_issues.py --api-key "YOUR_KEY" --search "repo:google-gemini/gemini-cli -label:type/bug -label:type/feature is:open" --limit 50
 ```
 
 **B) Export to JSON:** Once the issues are correctly labeled on GitHub, fetch
diff --git a/scripts/backlog-analysis/categorize_issues.py b/scripts/backlog-analysis/categorize_issues.py
new file mode 100644
index 0000000000..94f049e04e
--- /dev/null
+++ b/scripts/backlog-analysis/categorize_issues.py
@@ -0,0 +1,104 @@
+"""
+Purpose: Automatically categorizes GitHub issues as 'bug' or 'feature' and applies the corresponding label and title prefix on GitHub.
+It fetches issues matching a search query, uses the Gemini API to classify them, and runs 'gh issue edit' to update GitHub.
+"""
+import argparse
+import urllib.request
+import json
+import subprocess
+import sys
+import concurrent.futures
+
+MODEL = "gemini-3-flash-preview"  # Gemini model ID, interpolated into the generateContent request URL
+
+def fetch_issues(search_query, limit):
+    """Return the issues matching `search_query` (at most `limit`), parsed from `gh issue list --json`; exit 1 if gh fails."""
+    gh_args = ["gh", "issue", "list", "--search", search_query, "--limit", str(limit), "--json", "number,title,body,url"]
+    try:
+        return json.loads(subprocess.run(gh_args, capture_output=True, text=True, check=True).stdout)
+    except subprocess.CalledProcessError as err:
+        print(f"Error fetching issues: {err.stderr}", file=sys.stderr)
+        sys.exit(1)
+
+def categorize_issue(issue, api_key):
+    """Classify one GitHub issue as a bug or a feature request via the Gemini API.
+
+    Args:
+        issue: Issue dict; 'title', 'body' and 'number' are read.
+        api_key: Gemini API key, sent in the x-goog-api-key header so it stays out of the URL.
+
+    Returns:
+        The JSON object parsed from the model reply, or None if the request or parsing fails.
+    """
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent"
+    # 'body' can be null rather than missing, hence `or ''` instead of a .get() default.
+    prompt = f"""
+Analyze the following GitHub issue and determine if it is a bug or a feature request.
+Reply ONLY with a valid JSON object matching exactly this schema, without Markdown formatting:
+{{"type": "bug" | "feature", "reasoning": "brief justification"}}
+
+Issue Title: {issue.get('title')}
+Issue Body: {(issue.get('body') or '')[:1500]}
+"""
+    data = {"contents": [{"parts": [{"text": prompt}]}], "generationConfig": {"temperature": 0.1}}
+    headers = {'Content-Type': 'application/json', 'x-goog-api-key': api_key}
+    req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers=headers)
+    try:
+        # A timeout keeps a stalled connection from blocking its worker thread forever.
+        with urllib.request.urlopen(req, timeout=60) as response:
+            result = json.loads(response.read().decode('utf-8'))
+        text = result['candidates'][0]['content']['parts'][0]['text']
+        # Keep only the outermost {...}: drops Markdown fences and stray whitespace around the JSON.
+        return json.loads(text[text.find('{'):text.rfind('}') + 1])
+    except Exception as e:  # best effort: report and let the caller skip this issue
+        print(f"Error processing issue {issue['number']}: {e}", file=sys.stderr)
+        return None
+
+def process_issue(issue, api_key):
+    """Categorize one issue, then add its 'type/<kind>' label and title prefix via `gh issue edit`.
+
+    Failures are reported and skipped rather than raised, so one bad issue does not stop the batch.
+    """
+    print(f"Categorizing Issue #{issue['number']}...")
+    result = categorize_issue(issue, api_key)
+    # Normalize and validate the model's answer; anything but 'bug'/'feature' must never become a label.
+    issue_type = str(result.get('type', '')).strip().lower() if isinstance(result, dict) else ''
+    if issue_type not in ('bug', 'feature'):
+        print(f"Failed to categorize #{issue['number']}.")
+        return
+    label = f"type/{issue_type}"
+    print(f"Issue #{issue['number']} is a {issue_type}. Applying label '{label}' on GitHub...")
+    cmd = ["gh", "issue", "edit", str(issue['number']), "--add-label", label]
+    # Prepend the type to the title if it's not already there
+    title = issue.get('title') or ''
+    if not title.lower().startswith((f"[{issue_type}]", f"{issue_type}:")):
+        cmd.extend(["--title", f"[{issue_type.capitalize()}] {title}"])
+    try:
+        subprocess.run(cmd, capture_output=True, text=True, check=True)
+        print(f"Successfully labeled and updated title for #{issue['number']}.")
+    except subprocess.CalledProcessError as e:
+        print(f"Error labeling #{issue['number']}: {e.stderr}", file=sys.stderr)
+
+def main():
+    """Parse the CLI arguments, fetch the matching issues and categorize them on four worker threads."""
+    parser = argparse.ArgumentParser(description="Auto-categorize GitHub issues (bug vs feature) and apply labels on GitHub.")
+    parser.add_argument("--api-key", required=True, help="Gemini API Key")
+    parser.add_argument("--search", required=True, help="GitHub search query for issues to categorize")
+    parser.add_argument("--limit", type=int, default=50, help="Maximum number of issues to process")
+    args = parser.parse_args()
+
+    print(f"Fetching issues matching: '{args.search}'")
+    issues = fetch_issues(args.search, args.limit)
+    if not issues:
+        print("No issues found matching the query.")
+        return
+    print(f"Found {len(issues)} issues to categorize.")
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        futures = {executor.submit(process_issue, issue, args.api_key): issue for issue in issues}
+        for future in concurrent.futures.as_completed(futures):
+            if future.exception() is not None:  # wait() alone would silently discard worker exceptions
+                print(f"Unexpected error on #{futures[future]['number']}: {future.exception()}", file=sys.stderr)
+    print("Done categorizing issues.")
+
+if __name__ == '__main__':  # run the CLI only when executed as a script, not on import
+    main()