feat: merge steps 2-5 into a single unified pipeline script run_pipeline.sh

This commit is contained in:
Coco Sheng
2026-05-06 16:26:54 -04:00
parent 6b2ea5dd47
commit ab41447450
5 changed files with 97 additions and 87 deletions
+8 -32
View File
@@ -11,8 +11,8 @@ and determining implementation effort levels for the Gemini CLI project.
validation (e.g., `validate_effort.py`, `inject_manual_fixes.py`).
- `*.py`: Core analysis and export scripts (e.g., `bug_analyzer_final.py`,
`generate_bugs_csv.py`).
- `loop_analyzer.sh`: A shell script for running iterative analysis until all
issues are processed.
- `run_pipeline.sh`: A shell script that orchestrates the entire effort analysis
pipeline end-to-end.
## 📥 Prerequisites: Data Generation
@@ -51,41 +51,17 @@ gemini "Read data/uncategorized.json. For each issue, determine if it is a bug o
_Note: Make sure your `gemini-cli` has permission to execute shell commands if
you want it to apply the labels automatically via `gh`._
### 2. Initial Triage (Static)
### 2. Full Effort Analysis Pipeline
Use this for a quick, first-pass estimation.
Instead of running individual steps manually, you can run the entire analysis
pipeline (Initial Triage -> Deep Agentic Analysis -> Iterative Recovery ->
Validation -> CSV Export) with a single command.
```bash
python3 analyze_bugs.py --api-key "YOUR_KEY"
GEMINI_API_KEY="YOUR_KEY" ./run_pipeline.sh data/bugs.json ../../packages
```
### 3. Deep Agentic Analysis
Uses Gemini as an agent with access to the codebase.
```bash
python3 bug_analyzer_final.py --api-key "YOUR_KEY"
```
### 4. Iterative Analysis
Runs the single-turn analyzer in a loop until all issues have a valid analysis.
```bash
GEMINI_API_KEY="YOUR_KEY" ./loop_analyzer.sh
```
### 5. Validation & Export
Run validation from the utils folder to ensure consistency, then generate a
readable report.
```bash
python3 utils/validate_effort.py
python3 generate_bugs_csv.py
```
### 6. Generic Issue Processing
### 3. Generic Issue Processing
For any other backlog task (e.g., categorizing features, updating labels, or
custom analysis), use the `generic_processor.py`. This script allows you to
+28 -24
View File
@@ -2,24 +2,22 @@
Purpose: Exports analyzed JSON issue data into a human-readable CSV format.
This is typically the final step in the workflow, making the output suitable for sharing, spreadsheet import, or manual review.
"""
import argparse
import json
import csv
from datetime import datetime
BUGS_FILE = 'data/bugs.json'
METADATA_FILE = 'data/metadata_bugs.json'
CSV_FILE = 'data/bugs.csv'
parser = argparse.ArgumentParser(description="Export JSON issues to CSV.")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file")
parser.add_argument("--output", default="data/bugs.csv", help="Output CSV file")
args = parser.parse_args()
with open(BUGS_FILE, 'r') as f:
bugs = json.load(f)
with open(args.input, 'r') as f:
issues = json.load(f)
with open(METADATA_FILE, 'r') as f:
metadata_list = json.load(f)
today = datetime.now().strftime("%Y-%m-%d")
metadata_map = {m['number']: m for m in metadata_list}
today = "2026-04-21"
with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
with open(args.output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerow([
'Issue ID', 'Title', 'Status', 'Assignee', 'Labels',
@@ -27,25 +25,31 @@ with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
'reasoning', 'recommended_implementation'
])
for bug in bugs:
num = bug.get('number')
meta = metadata_map.get(num, {})
for issue in issues:
num = issue.get('number')
assignee = ", ".join([a['login'] for a in meta.get('assignees', [])])
labels = ", ".join([l['name'] for l in meta.get('labels', [])])
assignee_list = issue.get('assignees', [])
if isinstance(assignee_list, dict) and 'nodes' in assignee_list:
assignee_list = assignee_list['nodes']
assignee = ", ".join([a.get('login', '') for a in assignee_list])
labels_list = issue.get('labels', [])
if isinstance(labels_list, dict) and 'nodes' in labels_list:
labels_list = labels_list['nodes']
labels = ", ".join([l.get('name', '') for l in labels_list])
writer.writerow([
num,
bug.get('title', ''),
meta.get('state', 'open'),
issue.get('title', ''),
issue.get('state', 'OPEN'),
assignee,
labels,
today,
bug.get('url', ''),
bug.get('analysis', ''),
bug.get('effort_level', ''),
bug.get('reasoning', ''),
bug.get('recommended_implementation', '')
issue.get('url', ''),
issue.get('analysis', ''),
issue.get('effort_level', ''),
issue.get('reasoning', ''),
issue.get('recommended_implementation', '')
])
print(f"Successfully generated {CSV_FILE}")
print(f"Successfully generated {args.output}")
-20
View File
@@ -1,20 +0,0 @@
#!/bin/bash
# Run from the project root or the scripts/backlog-analysis directory
# This script assumes it's running in the same directory as the python scripts

# Abort early when no API key was provided.
if test -z "$GEMINI_API_KEY"; then
    echo "Error: GEMINI_API_KEY environment variable is required."
    echo "Usage: GEMINI_API_KEY=your_key ./loop_analyzer.sh"
    exit 1
fi

# Keep re-running the single-turn analyzer until no bug is left without a
# usable analysis (missing, empty, shorter than 30 chars, or explicitly failed).
while :; do
    count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' data/bugs.json)
    if [ "$count" -eq 0 ]; then
        echo "All bugs processed!"
        break
    fi
    echo "Remaining bugs: $count"
    python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY"
done

# Export the final results once everything has been analyzed.
python3 generate_bugs_csv.py
+53
View File
@@ -0,0 +1,53 @@
#!/bin/bash
# run_pipeline.sh
# Purpose: Orchestrates the full effort analysis pipeline end-to-end:
#   initial triage -> deep agentic analysis -> iterative recovery ->
#   heuristic validation -> CSV export.
#
# Usage: GEMINI_API_KEY=your_key ./run_pipeline.sh [INPUT_FILE] [PROJECT_DIR]
#   INPUT_FILE  : JSON file of issues to analyze (default: data/bugs.json)
#   PROJECT_DIR : project root used for codebase heuristics (default: ../../packages)

# Fail fast: abort on the first failing step, on use of unset variables, and
# on failures inside pipelines, so later stages never run on partial data.
set -euo pipefail

# ${VAR:-} keeps the emptiness check safe under 'set -u' when the key is unset.
if [ -z "${GEMINI_API_KEY:-}" ]; then
    echo "Error: GEMINI_API_KEY environment variable is required."
    echo "Usage: GEMINI_API_KEY=your_key ./run_pipeline.sh [INPUT_FILE] [PROJECT_DIR]"
    exit 1
fi

INPUT_FILE=${1:-"data/bugs.json"}
PROJECT_DIR=${2:-"../../packages"}
# Derive the CSV path from the input name (data/bugs.json -> data/bugs.csv).
OUTPUT_CSV="${INPUT_FILE%.json}.csv"

if [ ! -f "$INPUT_FILE" ]; then
    echo "Error: input file '$INPUT_FILE' not found."
    exit 1
fi

echo "=========================================="
echo "Step 1: Initial Triage (Static Pass)"
echo "=========================================="
python3 analyze_bugs.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
echo ""

echo "=========================================="
echo "Step 2: Deep Agentic Analysis"
echo "=========================================="
python3 bug_analyzer_final.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
echo ""

echo "=========================================="
echo "Step 3: Iterative Recovery Analysis"
echo "=========================================="
# Re-run the single-turn analyzer until every issue has a usable analysis.
# MAX_PASSES guards against spinning forever when some issues never converge.
MAX_PASSES=20
pass=0
while true; do
    # '|| true' keeps a jq failure (e.g. malformed JSON) from killing the
    # script under 'set -e'; the empty-count check below handles that case.
    count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' "$INPUT_FILE" 2>/dev/null || true)
    if [ -z "$count" ] || [ "$count" -eq 0 ]; then
        echo "All issues successfully processed!"
        break
    fi
    if [ "$pass" -ge "$MAX_PASSES" ]; then
        echo "Warning: $count issue(s) still unanalyzed after $MAX_PASSES passes; continuing."
        break
    fi
    pass=$((pass + 1))
    echo "Remaining unanalyzed issues: $count"
    python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
done
echo ""

echo "=========================================="
echo "Step 4: Heuristic Validation"
echo "=========================================="
python3 utils/validate_effort.py --input "$INPUT_FILE" --project "$PROJECT_DIR"
echo ""

echo "=========================================="
echo "Step 5: Exporting to CSV"
echo "=========================================="
python3 generate_bugs_csv.py --input "$INPUT_FILE" --output "$OUTPUT_CSV"
echo ""
echo "✅ Pipeline Complete! Results saved to $OUTPUT_CSV"
@@ -2,12 +2,18 @@
Purpose: Runs heuristic post-analysis validation on the AI's effort estimations.
Checks for keywords (like 'Windows', 'WSL', 'PTY') in the issue body to ensure the AI didn't underestimate platform-specific or architecturally complex bugs as 'small'.
"""
import argparse
import json
import re
import os
ISSUES_FILE = 'backlog-analysis/issues.json'
REPO_ROOT = '/Users/cocosheng/gemini-cli'
parser = argparse.ArgumentParser(description="Validate effort levels using heuristics.")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing analyzed issues")
parser.add_argument("--project", default="../../packages", help="Project root for codebase validation")
args = parser.parse_args()
ISSUES_FILE = args.input
REPO_ROOT = args.project
with open(ISSUES_FILE, 'r') as f:
issues = json.load(f)
@@ -35,17 +41,13 @@ SMALL_KEYWORDS = [
]
def find_files_in_text(text):
    """Extract candidate source-file paths mentioned in *text*.

    Matches path-like tokens ending in .ts, .tsx, .js, .json or .md
    (e.g. ``packages/cli/src/ui/components/Footer.tsx`` or ``Footer.tsx``)
    and returns them as a set, dropping anything that starts with 'http'
    to filter out URLs and common false positives.
    """
    # The trailing \b stops the alternation from truncating the match via
    # backtracking: without it, 'Footer.tsx' matched as 'Footer.ts' because
    # the 'ts' branch is tried first and nothing anchored the extension end.
    matches = re.findall(r'([\w\./\-]+\.(?:ts|tsx|js|json|md))\b', text)
    # Filter out URLs or common false positives.
    return {m for m in matches if not m.startswith('http')}
def resolve_file(filename):
if os.path.exists(os.path.join(REPO_ROOT, filename)):
return os.path.join(REPO_ROOT, filename)
# Try searching the repo for the basename
basename = os.path.basename(filename)
for root, dirs, files in os.walk(REPO_ROOT):
if '.git' in root or 'node_modules' in root:
@@ -82,7 +84,6 @@ def analyze_issue(issue):
effort = "small"
validation_msg = ""
# Keyword analysis
keyword_effort = "small"
for kw in LARGE_KEYWORDS:
if re.search(r'\b' + re.escape(kw) + r'\b', combined_text):
@@ -95,9 +96,7 @@ def analyze_issue(issue):
keyword_effort = "medium"
break
# Codebase heuristic
if num_files == 0:
# If no files found, rely strictly on keywords, but default to medium to be safe
effort = keyword_effort if keyword_effort in ['medium', 'large'] else 'medium'
validation_msg = f"No specific files identified in codebase. Keyword heuristic: {keyword_effort}."
else:
@@ -121,9 +120,7 @@ for issue in issues:
issue['effort_level'] = new_effort
# Store the validation reason in the reasoning field
existing_reasoning = issue.get('reasoning', '')
# Strip any previous validation messages
existing_reasoning = existing_reasoning.split(' | Codebase validation:')[0]
existing_reasoning = existing_reasoning.split(' | No specific files identified')[0]