feat: add fetch_from_url.py to easily download issues via standard github search urls

This commit is contained in:
Coco Sheng
2026-05-06 16:22:59 -04:00
parent 08090cc31c
commit 966758875e
2 changed files with 84 additions and 24 deletions
+10 -24
View File
@@ -17,35 +17,21 @@ and determining implementation effort levels for the Gemini CLI project.
## 📥 Prerequisites: Data Generation
Before running the analyzers, you must fetch the issue data from GitHub. The
scripts expect the data in JSON format, which can be easily generated using the
[GitHub CLI (`gh`)](https://cli.github.com/).
scripts expect the data in JSON format.
### Generating `bugs.json`
The easiest way to generate this is to simply copy the URL from your browser
when looking at a filtered list of issues on GitHub, and pass it to our fetcher
script.
To extract all open bugs with the necessary fields (`number`, `title`, `body`,
and `url`):
_(Note: You must have the [GitHub CLI (`gh`)](https://cli.github.com/) installed
and authenticated)._
```bash
gh issue list \
--repo google-gemini/gemini-cli \
--label "type/bug" \
--state open \
--limit 1000 \
--json number,title,body,url > data/bugs.json
```
# Fetch any filtered list of issues directly from a GitHub URL
python3 fetch_from_url.py "https://github.com/google-gemini/gemini-cli/issues/?q=type%3ABug+is%3Aopen" --output data/bugs.json
### Generating `issues.json` (Features/General)
To extract general issues or features, simply change the label. You may also
want additional fields like `labels` or `assignees`:
```bash
gh issue list \
--repo google-gemini/gemini-cli \
--label "type/feature" \
--state open \
--limit 1000 \
--json number,title,body,url,labels,assignees,state > data/issues.json
# Fetch features to a different file
python3 fetch_from_url.py "https://github.com/google-gemini/gemini-cli/issues/?q=type%3AFeature+is%3Aopen" --output data/issues.json
```
## 🚀 Workflows
@@ -0,0 +1,74 @@
"""
Purpose: Fetches issues from GitHub using a standard GitHub Issues search URL.
It parses the URL to extract the search query (the 'q=' parameter) and uses the GitHub CLI ('gh') to download the matching issues into a JSON file, ready for analysis.
"""
import argparse
import urllib.parse
import subprocess
import json
import sys
import os
def main():
    """Download GitHub issues matching a standard Issues search URL via the gh CLI.

    Parses the URL's ``q=`` parameter as the search query, infers the
    ``owner/repo`` from the URL path when possible, runs
    ``gh issue list --search ...`` and writes the matching issues as
    pretty-printed JSON to ``--output``. Exits with status 1 if the gh
    CLI is missing, fails, or returns invalid JSON.
    """
    parser = argparse.ArgumentParser(description="Fetch GitHub issues from a search URL.")
    parser.add_argument("url", help="The full GitHub Issues search URL (e.g., https://github.com/.../issues/?q=...)")
    parser.add_argument("--output", default="data/bugs.json", help="Path to save the output JSON (default: data/bugs.json)")
    parser.add_argument("--limit", type=int, default=1000, help="Maximum number of issues to fetch")
    args = parser.parse_args()

    parsed_url = urllib.parse.urlparse(args.url)
    query_params = urllib.parse.parse_qs(parsed_url.query)

    # Extract the 'q' parameter. If there isn't one, fall back to all open issues.
    if 'q' in query_params:
        search_query = query_params['q'][0]
    else:
        print("Warning: No 'q=' search parameter found in URL. Fetching default open issues.")
        search_query = "is:issue is:open"

    # Determine the repository from the URL path (e.g. /google-gemini/gemini-cli/issues)
    # and pass it via --repo. Fix: the repo used to be embedded only as a 'repo:'
    # qualifier inside the search string, which still required running the script
    # from inside a git checkout for gh to resolve a base repository; an explicit
    # --repo flag works from any directory.
    repo = None
    path_parts = [p for p in parsed_url.path.split('/') if p]
    if len(path_parts) >= 2:
        repo = f"{path_parts[0]}/{path_parts[1]}"
    elif 'repo:' not in search_query:
        print("Warning: could not determine repository from URL; gh will use the current directory's git remote.")

    print(f"Extracted Search Query: {search_query}")
    print(f"Fetching up to {args.limit} issues...")

    # Required fields for our analysis tools
    fields = "number,title,body,url,labels,assignees,state"
    cmd = [
        "gh", "issue", "list",
        "--search", search_query,
        "--limit", str(args.limit),
        "--json", fields
    ]
    if repo:
        cmd += ["--repo", repo]

    try:
        # Run gh CLI; check=True raises CalledProcessError on a non-zero exit.
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        issues = json.loads(result.stdout)
        # Ensure the output directory exists before writing.
        os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(issues, f, indent=2)
        print(f"Successfully fetched {len(issues)} issues and saved to {args.output}")
    except FileNotFoundError:
        # Fix: a missing 'gh' binary raises FileNotFoundError (not
        # CalledProcessError), which previously surfaced as a raw traceback.
        print("Error: 'gh' CLI not found.", file=sys.stderr)
        print("Make sure you have the 'gh' CLI installed and authenticated (gh auth login).", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error running GitHub CLI: {e.stderr}", file=sys.stderr)
        print("Make sure you have the 'gh' CLI installed and authenticated (gh auth login).", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError:
        print("Error: GitHub CLI did not return valid JSON.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()