# gemini-cli/scripts/backlog-analysis/categorize_issues.py

"""
Purpose: Automatically categorizes GitHub issues as 'bug' or 'feature' and applies the corresponding label on GitHub.
It fetches issues matching a search URL, uses the Gemini API to classify them, and runs 'gh issue edit' to update GitHub.
"""
import argparse
import urllib.parse
import urllib.request
import json
import subprocess
import sys
import concurrent.futures

# Gemini model identifier; categorize_issue() interpolates it into the
# generateContent endpoint URL.
MODEL = "gemini-3-flash-preview"

def fetch_issues(search_query, limit):
    """Fetch issues matching a GitHub search query through the ``gh`` CLI.

    Args:
        search_query: Search string passed to ``gh issue list --search``.
        limit: Maximum number of issues to request from ``gh``.

    Returns:
        The decoded JSON output of ``gh``; each entry is requested with the
        ``number``, ``title``, ``body`` and ``url`` fields.

    Raises:
        SystemExit: With status 1 when ``gh`` is not installed, exits with a
            non-zero status, or prints output that is not valid JSON.
    """
    cmd = ["gh", "issue", "list", "--search", search_query, "--limit", str(limit), "--json", "number,title,body,url"]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError:
        # subprocess raises this (not CalledProcessError) when the binary is absent.
        print("Error fetching issues: the 'gh' CLI was not found on PATH.", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error fetching issues: {e.stderr}", file=sys.stderr)
        sys.exit(1)

    try:
        return json.loads(res.stdout)
    except json.JSONDecodeError as e:
        print(f"Error fetching issues: could not parse gh output as JSON: {e}", file=sys.stderr)
        sys.exit(1)

def _strip_code_fence(text):
    """Return *text* without surrounding whitespace or a wrapping Markdown code fence."""
    # Strip first: a leading newline or a trailing newline after the closing
    # fence would otherwise defeat the startswith/endswith checks below.
    text = text.strip()
    if text.startswith('```'):
        text = text[3:]
        if text[:4].lower() == 'json':
            text = text[4:]
    if text.endswith('```'):
        text = text[:-3]
    return text.strip()


def categorize_issue(issue, api_key):
    """Ask the Gemini API whether an issue is a bug or a feature request.

    Args:
        issue: Issue dict; ``title``, ``body`` and ``number`` are read. Only the
            first 1500 characters of the body are sent to the model.
        api_key: Gemini API key, sent as the ``key`` query parameter.

    Returns:
        The JSON object parsed from the model's reply (the prompt asks for
        ``type`` and ``reasoning`` keys), or ``None`` if the request fails or
        the reply is not a JSON object.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={api_key}"
    # 'body' can be present but None, in which case .get('body', '') would
    # return None and slicing it would raise TypeError.
    body = issue.get('body') or ''
    prompt = f"""
Analyze the following GitHub issue and determine if it is a bug or a feature request.
Reply ONLY with a valid JSON object matching exactly this schema, without Markdown formatting:
{{"type": "bug" | "feature", "reasoning": "brief justification"}}

Issue Title: {issue.get('title')}
Issue Body: {body[:1500]}
"""
    data = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0.1}
    }
    req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
    try:
        # Bound the request so one stalled connection cannot hang a worker forever.
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        text = result['candidates'][0]['content']['parts'][0]['text']
        parsed = json.loads(_strip_code_fence(text))
    except Exception as e:
        # Deliberately broad: a failure on one issue must not abort the batch.
        print(f"Error processing issue {issue['number']}: {e}")
        return None

    if not isinstance(parsed, dict):
        print(f"Error processing issue {issue['number']}: model reply is not a JSON object")
        return None
    return parsed

def process_issue(issue, api_key):
    """Categorize one issue and apply the result on GitHub via ``gh issue edit``.

    Adds the label ``type/bug`` or ``type/feature`` and, unless the title
    already starts with ``[<type>]`` or ``<type>:`` (case-insensitive),
    prefixes the title with ``[Bug]`` / ``[Feature]``.

    Args:
        issue: Issue dict; ``number`` and ``title`` are read here.
        api_key: Gemini API key forwarded to ``categorize_issue``.

    Returns:
        None. Failures are reported on stdout/stderr and do not raise.
    """
    print(f"Categorizing Issue #{issue['number']}...")
    result = categorize_issue(issue, api_key)
    if not result or 'type' not in result:
        print(f"Failed to categorize #{issue['number']}.")
        return

    # The type comes from model output: normalise it and refuse anything
    # outside the two expected categories rather than creating arbitrary labels.
    issue_type = str(result['type']).strip().lower()
    if issue_type not in ("bug", "feature"):
        print(f"Failed to categorize #{issue['number']}: unexpected type {result['type']!r}.")
        return

    label = f"type/{issue_type}"
    print(f"Issue #{issue['number']} is a {issue_type}. Applying label '{label}' on GitHub...")

    cmd = ["gh", "issue", "edit", str(issue['number']), "--add-label", label]

    # Prepend the type to the title if it's not already there
    title = issue.get('title') or ''
    needs_prefix = not title.lower().startswith((f"[{issue_type}]", f"{issue_type}:"))
    if needs_prefix:
        new_title = f"[{issue_type.capitalize()}] {title}"
        cmd.extend(["--title", new_title])

    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error labeling #{issue['number']}: {e.stderr}", file=sys.stderr)
        return

    if needs_prefix:
        print(f"Successfully labeled and updated title for #{issue['number']}.")
    else:
        print(f"Successfully labeled #{issue['number']}.")
def main():
    """Command-line entry point.

    Parses a GitHub Issues search URL, turns it into a ``gh`` search query
    (adding a ``repo:owner/name`` qualifier derived from the URL path when the
    query has none), fetches up to ``--limit`` issues and categorizes them
    with four worker threads.
    """
    parser = argparse.ArgumentParser(description="Auto-categorize GitHub issues (bug vs feature) from a GitHub URL and apply labels on GitHub.")
    parser.add_argument("url", help="The full GitHub Issues search URL (e.g., https://github.com/.../issues/?q=...)")
    parser.add_argument("--api-key", required=True, help="Gemini API Key")
    parser.add_argument("--limit", type=int, default=50, help="Maximum number of issues to process")
    args = parser.parse_args()

    parsed_url = urllib.parse.urlparse(args.url)
    query_params = urllib.parse.parse_qs(parsed_url.query)

    if 'q' in query_params:
        search_query = query_params['q'][0]
    else:
        print("Warning: No 'q=' search parameter found in URL. Fetching default open issues.")
        search_query = "is:issue is:open"

    if 'repo:' not in search_query:
        # The first two path segments of the URL are taken as owner/repository.
        path_parts = [p for p in parsed_url.path.split('/') if p]
        if len(path_parts) >= 2:
            repo = f"{path_parts[0]}/{path_parts[1]}"
            search_query = f"repo:{repo} {search_query}"

    print(f"Fetching issues matching: '{search_query}'")
    issues = fetch_issues(search_query, args.limit)
    if not issues:
        print("No issues found matching the query.")
        return

    print(f"Found {len(issues)} issues to categorize.")

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(process_issue, issue, args.api_key): issue for issue in issues}
        # Inspect every future: an exception raised inside a worker is stored on
        # the future and would otherwise vanish without any output.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                number = futures[future].get('number', '?')
                print(f"Unexpected error processing issue #{number}: {error}", file=sys.stderr)

    print("Done categorizing issues.")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()