diff --git a/scripts/backlog-analysis/README.md b/scripts/backlog-analysis/README.md index 96e83ba5fe..0415033617 100644 --- a/scripts/backlog-analysis/README.md +++ b/scripts/backlog-analysis/README.md @@ -17,35 +17,21 @@ and determining implementation effort levels for the Gemini CLI project. ## 📥 Prerequisites: Data Generation Before running the analyzers, you must fetch the issue data from GitHub. The -scripts expect the data in JSON format, which can be easily generated using the -[GitHub CLI (`gh`)](https://cli.github.com/). +scripts expect the data in JSON format. -### Generating `bugs.json` +The easiest way to generate this is to simply copy the URL from your browser +when looking at a filtered list of issues on GitHub, and pass it to our fetcher +script. -To extract all open bugs with the necessary fields (`number`, `title`, `body`, -and `url`): +_(Note: You must have the [GitHub CLI (`gh`)](https://cli.github.com/) installed +and authenticated)._ ```bash -gh issue list \ - --repo google-gemini/gemini-cli \ - --label "type/bug" \ - --state open \ - --limit 1000 \ - --json number,title,body,url > data/bugs.json -``` +# Fetch any filtered list of issues directly from a GitHub URL +python3 fetch_from_url.py "https://github.com/google-gemini/gemini-cli/issues/?q=type%3ABug+is%3Aopen" --output data/bugs.json -### Generating `issues.json` (Features/General) - -To extract general issues or features, simply change the label. 
"""
Purpose: Fetches issues from GitHub using a standard GitHub Issues search URL.
It parses the URL to extract the search query (the 'q=' parameter) and uses the
GitHub CLI ('gh') to download the matching issues into a JSON file, ready for
analysis.
"""
import argparse
import urllib.parse
import subprocess
import json
import sys
import os


def parse_issue_search_url(url):
    """Extract the repository and search query from a GitHub issues search URL.

    Args:
        url: A GitHub issues URL, e.g.
            "https://github.com/owner/name/issues/?q=type%3ABug+is%3Aopen".

    Returns:
        A tuple ``(repo, query)`` where ``repo`` is ``"owner/name"`` when it
        can be determined from the URL path (``None`` otherwise), and
        ``query`` is the decoded ``q=`` parameter, defaulting to
        ``"is:issue is:open"`` when the URL carries no ``q=`` parameter.
    """
    parsed_url = urllib.parse.urlparse(url)
    query_params = urllib.parse.parse_qs(parsed_url.query)

    # Extract the 'q' parameter. If there isn't one, fall back to a sensible
    # default, since gh CLI requires explicit search terms.
    if 'q' in query_params:
        search_query = query_params['q'][0]
    else:
        print("Warning: No 'q=' search parameter found in URL. Fetching default open issues.")
        search_query = "is:issue is:open"

    # Try to extract the repo from the URL path
    # (e.g. /google-gemini/gemini-cli/issues -> "google-gemini/gemini-cli").
    repo = None
    path_parts = [p for p in parsed_url.path.split('/') if p]
    if len(path_parts) >= 2:
        repo = f"{path_parts[0]}/{path_parts[1]}"
    return repo, search_query


def main():
    """CLI entry point: parse arguments, run 'gh issue list', save JSON output.

    Exits with status 1 when the gh CLI is missing, fails, or returns
    invalid JSON.
    """
    parser = argparse.ArgumentParser(description="Fetch GitHub issues from a search URL.")
    parser.add_argument("url", help="The full GitHub Issues search URL (e.g., https://github.com/.../issues/?q=...)")
    parser.add_argument("--output", default="data/bugs.json", help="Path to save the output JSON (default: data/bugs.json)")
    parser.add_argument("--limit", type=int, default=1000, help="Maximum number of issues to fetch")

    args = parser.parse_args()

    repo, search_query = parse_issue_search_url(args.url)

    print(f"Extracted Search Query: {search_query}")
    print(f"Fetching up to {args.limit} issues...")

    # Required fields for our analysis tools
    fields = "number,title,body,url,labels,assignees,state"

    cmd = [
        "gh", "issue", "list",
        "--search", search_query,
        "--limit", str(args.limit),
        "--json", fields
    ]
    # Pass the repository explicitly so the script also works when run outside
    # a git checkout of the target repo. (Embedding "repo:" in the search
    # string relies on gh resolving a default repo from the working
    # directory.) Skip this when the user already scoped the query themselves.
    if repo is not None and 'repo:' not in search_query:
        cmd.extend(["--repo", repo])

    try:
        # Run gh CLI; check=True raises CalledProcessError on non-zero exit.
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        issues = json.loads(result.stdout)

        # Ensure the output directory exists (abspath guarantees a non-empty
        # dirname even for bare filenames like "bugs.json").
        os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)

        # Explicit encoding so output does not depend on the platform locale.
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(issues, f, indent=2)

        print(f"Successfully fetched {len(issues)} issues and saved to {args.output}")

    except FileNotFoundError:
        # subprocess raises this when the 'gh' executable is not on PATH.
        print("Error: 'gh' CLI not found on PATH.", file=sys.stderr)
        print("Make sure you have the 'gh' CLI installed and authenticated (gh auth login).", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error running GitHub CLI: {e.stderr}", file=sys.stderr)
        print("Make sure you have the 'gh' CLI installed and authenticated (gh auth login).", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError:
        print("Error: GitHub CLI did not return valid JSON.", file=sys.stderr)
        sys.exit(1)

if __name__ == "__main__":
    main()