From ab41447450444c6ada8f3d0b708c51923e2637de Mon Sep 17 00:00:00 2001
From: Coco Sheng
Date: Wed, 6 May 2026 16:26:54 -0400
Subject: [PATCH] feat: merge steps 2-5 into a unified pipeline script
 run_pipeline.sh

---
 scripts/backlog-analysis/README.md            | 40 +++----------
 scripts/backlog-analysis/generate_bugs_csv.py | 53 ++++++++++---------
 scripts/backlog-analysis/loop_analyzer.sh     | 20 -------
 scripts/backlog-analysis/run_pipeline.sh      | 58 ++++++++++++++++++++
 .../backlog-analysis/utils/validate_effort.py | 19 +++----
 5 files changed, 103 insertions(+), 87 deletions(-)
 delete mode 100755 scripts/backlog-analysis/loop_analyzer.sh
 create mode 100755 scripts/backlog-analysis/run_pipeline.sh

diff --git a/scripts/backlog-analysis/README.md b/scripts/backlog-analysis/README.md
index 1d8feec7fb..7306f218bd 100644
--- a/scripts/backlog-analysis/README.md
+++ b/scripts/backlog-analysis/README.md
@@ -11,8 +11,8 @@ and determining implementation effort levels for the Gemini CLI project.
   validation (e.g., `validate_effort.py`, `inject_manual_fixes.py`).
 - `*.py`: Core analysis and export scripts (e.g., `bug_analyzer_final.py`,
   `generate_bugs_csv.py`).
-- `loop_analyzer.sh`: A shell script for running iterative analysis until all
-  issues are processed.
+- `run_pipeline.sh`: A shell script that orchestrates the entire effort analysis
+  pipeline end-to-end.
 
 ## 📥 Prerequisites: Data Generation
 
@@ -51,41 +51,17 @@ gemini "Read data/uncategorized.json. For each issue, determine if it is a bug o
 _Note: Make sure your `gemini-cli` has permission to execute shell commands if
 you want it to apply the labels automatically via `gh`._
 
-### 2. Initial Triage (Static)
+### 2. Full Effort Analysis Pipeline
 
-Use this for a quick, first-pass estimation.
+Instead of running individual steps manually, you can run the entire analysis
+pipeline (Initial Triage -> Deep Agentic Analysis -> Iterative Recovery ->
+Validation -> CSV Export) with a single command.
 
 ```bash
-python3 analyze_bugs.py --api-key "YOUR_KEY"
+GEMINI_API_KEY="YOUR_KEY" ./run_pipeline.sh data/bugs.json ../../packages
 ```
 
-### 3. Deep Agentic Analysis
-
-Uses Gemini as an agent with access to the codebase.
-
-```bash
-python3 bug_analyzer_final.py --api-key "YOUR_KEY"
-```
-
-### 4. Iterative Analysis
-
-Runs the single-turn analyzer in a loop until all issues have a valid analysis.
-
-```bash
-GEMINI_API_KEY="YOUR_KEY" ./loop_analyzer.sh
-```
-
-### 5. Validation & Export
-
-Run validation from the utils folder to ensure consistency, then generate a
-readable report.
-
-```bash
-python3 utils/validate_effort.py
-python3 generate_bugs_csv.py
-```
-
-### 6. Generic Issue Processing
+### 3. Generic Issue Processing
 
 For any other backlog task (e.g., categorizing features, updating labels, or
 custom analysis), use the `generic_processor.py`. This script allows you to
diff --git a/scripts/backlog-analysis/generate_bugs_csv.py b/scripts/backlog-analysis/generate_bugs_csv.py
index a4ffeeb070..f745bfa951 100644
--- a/scripts/backlog-analysis/generate_bugs_csv.py
+++ b/scripts/backlog-analysis/generate_bugs_csv.py
@@ -2,24 +2,22 @@
 Purpose: Exports analyzed JSON issue data into a human-readable CSV format.
 This is typically the final step in the workflow, making the output suitable
 for sharing, spreadsheet import, or manual review.
""" +import argparse import json import csv from datetime import datetime -BUGS_FILE = 'data/bugs.json' -METADATA_FILE = 'data/metadata_bugs.json' -CSV_FILE = 'data/bugs.csv' +parser = argparse.ArgumentParser(description="Export JSON issues to CSV.") +parser.add_argument("--input", default="data/bugs.json", help="Input JSON file") +parser.add_argument("--output", default="data/bugs.csv", help="Output CSV file") +args = parser.parse_args() -with open(BUGS_FILE, 'r') as f: - bugs = json.load(f) +with open(args.input, 'r') as f: + issues = json.load(f) -with open(METADATA_FILE, 'r') as f: - metadata_list = json.load(f) +today = datetime.now().strftime("%Y-%m-%d") -metadata_map = {m['number']: m for m in metadata_list} -today = "2026-04-21" - -with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f: +with open(args.output, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f, delimiter='\t') writer.writerow([ 'Issue ID', 'Title', 'Status', 'Assignee', 'Labels', @@ -27,25 +25,31 @@ with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f: 'reasoning', 'recommended_implementation' ]) - for bug in bugs: - num = bug.get('number') - meta = metadata_map.get(num, {}) + for issue in issues: + num = issue.get('number') - assignee = ", ".join([a['login'] for a in meta.get('assignees', [])]) - labels = ", ".join([l['name'] for l in meta.get('labels', [])]) + assignee_list = issue.get('assignees', []) + if isinstance(assignee_list, dict) and 'nodes' in assignee_list: + assignee_list = assignee_list['nodes'] + assignee = ", ".join([a.get('login', '') for a in assignee_list]) + + labels_list = issue.get('labels', []) + if isinstance(labels_list, dict) and 'nodes' in labels_list: + labels_list = labels_list['nodes'] + labels = ", ".join([l.get('name', '') for l in labels_list]) writer.writerow([ num, - bug.get('title', ''), - meta.get('state', 'open'), + issue.get('title', ''), + issue.get('state', 'OPEN'), assignee, labels, today, - bug.get('url', ''), - bug.get('analysis', ''), - bug.get('effort_level', ''), - bug.get('reasoning', ''), - bug.get('recommended_implementation', '') + issue.get('url', ''), + issue.get('analysis', ''), + issue.get('effort_level', ''), + issue.get('reasoning', ''), + issue.get('recommended_implementation', '') ]) -print(f"Successfully generated {CSV_FILE}") +print(f"Successfully generated {args.output}") diff --git a/scripts/backlog-analysis/loop_analyzer.sh b/scripts/backlog-analysis/loop_analyzer.sh deleted file mode 100755 index f2a689f43e..0000000000 --- a/scripts/backlog-analysis/loop_analyzer.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Run from the project root or the scripts/backlog-analysis directory -# This script assumes it's running in the same directory as the python scripts - -if [ -z "$GEMINI_API_KEY" ]; then - echo "Error: GEMINI_API_KEY environment variable is required." - echo "Usage: GEMINI_API_KEY=your_key ./loop_analyzer.sh" - exit 1 -fi - -while true; do - count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' data/bugs.json) - if [ "$count" -eq 0 ]; then - echo "All bugs processed!" 
-        break
-    fi
-    echo "Remaining bugs: $count"
-    python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY"
-done
-python3 generate_bugs_csv.py
diff --git a/scripts/backlog-analysis/run_pipeline.sh b/scripts/backlog-analysis/run_pipeline.sh
new file mode 100755
index 0000000000..f5c1960970
--- /dev/null
+++ b/scripts/backlog-analysis/run_pipeline.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# run_pipeline.sh
+# Purpose: Orchestrates the full effort analysis pipeline end-to-end.
+
+if [ -z "$GEMINI_API_KEY" ]; then
+  echo "Error: GEMINI_API_KEY environment variable is required."
+  echo "Usage: GEMINI_API_KEY=your_key ./run_pipeline.sh [INPUT_FILE] [PROJECT_DIR]"
+  exit 1
+fi
+
+INPUT_FILE=${1:-"data/bugs.json"}
+PROJECT_DIR=${2:-"../../packages"}
+OUTPUT_CSV="${INPUT_FILE%.json}.csv"
+
+echo "=========================================="
+echo "Step 1: Initial Triage (Static Pass)"
+echo "=========================================="
+python3 analyze_bugs.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
+
+echo ""
+echo "=========================================="
+echo "Step 2: Deep Agentic Analysis"
+echo "=========================================="
+python3 bug_analyzer_final.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
+
+echo ""
+echo "=========================================="
+echo "Step 3: Iterative Recovery Analysis"
+echo "=========================================="
+while true; do
+  # Count issues whose analysis is missing, empty, or suspiciously short (< 30 chars).
+  count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' "$INPUT_FILE")
+  if [ -z "$count" ]; then
+    echo "Error: could not count remaining issues (is jq installed and is $INPUT_FILE valid JSON?)" >&2
+    exit 1
+  fi
+  if [ "$count" -eq 0 ]; then
+    echo "All issues successfully processed!"
+    break
+  fi
+  echo "Remaining unanalyzed issues: $count"
+  python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
+done
+
+echo ""
+echo "=========================================="
+echo "Step 4: Heuristic Validation"
+echo "=========================================="
+python3 utils/validate_effort.py --input "$INPUT_FILE" --project "$PROJECT_DIR"
+
+echo ""
+echo "=========================================="
+echo "Step 5: Exporting to CSV"
+echo "=========================================="
+python3 generate_bugs_csv.py --input "$INPUT_FILE" --output "$OUTPUT_CSV"
+
+echo ""
+echo "✅ Pipeline Complete! Results saved to $OUTPUT_CSV"
diff --git a/scripts/backlog-analysis/utils/validate_effort.py b/scripts/backlog-analysis/utils/validate_effort.py
index f6df2bf529..f7f9923fbf 100644
--- a/scripts/backlog-analysis/utils/validate_effort.py
+++ b/scripts/backlog-analysis/utils/validate_effort.py
@@ -2,12 +2,18 @@
 Purpose: Runs heuristic post-analysis validation on the AI's effort estimations.
 Checks for keywords (like 'Windows', 'WSL', 'PTY') in the issue body to ensure
 the AI didn't underestimate platform-specific or architecturally complex bugs
 as 'small'.
""" +import argparse import json import re import os -ISSUES_FILE = 'backlog-analysis/issues.json' -REPO_ROOT = '/Users/cocosheng/gemini-cli' +parser = argparse.ArgumentParser(description="Validate effort levels using heuristics.") +parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing analyzed issues") +parser.add_argument("--project", default="../../packages", help="Project root for codebase validation") +args = parser.parse_args() + +ISSUES_FILE = args.input +REPO_ROOT = args.project with open(ISSUES_FILE, 'r') as f: issues = json.load(f) @@ -35,17 +41,13 @@ SMALL_KEYWORDS = [ ] def find_files_in_text(text): - # match patterns like packages/cli/src/ui/components/Footer.tsx or Footer.tsx - # We will look for anything ending in .ts, .tsx, .js, .json matches = re.findall(r'([\w\.\/\-]+\.(?:ts|tsx|js|json|md))', text) - # filter out URLs or common false positives return set([m for m in matches if not m.startswith('http')]) def resolve_file(filename): if os.path.exists(os.path.join(REPO_ROOT, filename)): return os.path.join(REPO_ROOT, filename) - # Try searching the repo for the basename basename = os.path.basename(filename) for root, dirs, files in os.walk(REPO_ROOT): if '.git' in root or 'node_modules' in root: @@ -82,7 +84,6 @@ def analyze_issue(issue): effort = "small" validation_msg = "" - # Keyword analysis keyword_effort = "small" for kw in LARGE_KEYWORDS: if re.search(r'\b' + re.escape(kw) + r'\b', combined_text): @@ -95,9 +96,7 @@ def analyze_issue(issue): keyword_effort = "medium" break - # Codebase heuristic if num_files == 0: - # If no files found, rely strictly on keywords, but default to medium to be safe effort = keyword_effort if keyword_effort in ['medium', 'large'] else 'medium' validation_msg = f"No specific files identified in codebase. Keyword heuristic: {keyword_effort}." else: @@ -121,9 +120,7 @@ for issue in issues: issue['effort_level'] = new_effort - # Store the validation reason in the reasoning field existing_reasoning = issue.get('reasoning', '') - # Strip any previous validation messages existing_reasoning = existing_reasoning.split(' | Codebase validation:')[0] existing_reasoning = existing_reasoning.split(' | No specific files identified')[0]