feat: merge steps 2-5 into a single unified pipeline script run_pipeline.sh

This commit is contained in:
Coco Sheng
2026-05-06 16:26:54 -04:00
parent 6b2ea5dd47
commit ab41447450
5 changed files with 97 additions and 87 deletions
+8 -32
View File
@@ -11,8 +11,8 @@ and determining implementation effort levels for the Gemini CLI project.
validation (e.g., `validate_effort.py`, `inject_manual_fixes.py`).
- `*.py`: Core analysis and export scripts (e.g., `bug_analyzer_final.py`,
`generate_bugs_csv.py`).
- `loop_analyzer.sh`: A shell script for running iterative analysis until all
issues are processed.
- `run_pipeline.sh`: A shell script that orchestrates the entire effort analysis
pipeline end-to-end.
## 📥 Prerequisites: Data Generation
@@ -51,41 +51,17 @@ gemini "Read data/uncategorized.json. For each issue, determine if it is a bug o
_Note: Make sure your `gemini-cli` has permission to execute shell commands if
you want it to apply the labels automatically via `gh`._
### 2. Initial Triage (Static)
### 2. Full Effort Analysis Pipeline
Use this for a quick, first-pass estimation.
Instead of running individual steps manually, you can run the entire analysis
pipeline (Initial Triage -> Deep Agentic Analysis -> Iterative Recovery ->
Validation -> CSV Export) with a single command.
```bash
python3 analyze_bugs.py --api-key "YOUR_KEY"
GEMINI_API_KEY="YOUR_KEY" ./run_pipeline.sh data/bugs.json ../../packages
```
### 3. Deep Agentic Analysis
Uses Gemini as an agent with access to the codebase.
```bash
python3 bug_analyzer_final.py --api-key "YOUR_KEY"
```
### 4. Iterative Analysis
Runs the single-turn analyzer in a loop until all issues have a valid analysis.
```bash
GEMINI_API_KEY="YOUR_KEY" ./loop_analyzer.sh
```
### 5. Validation & Export
Run validation from the utils folder to ensure consistency, then generate a
readable report.
```bash
python3 utils/validate_effort.py
python3 generate_bugs_csv.py
```
### 6. Generic Issue Processing
### 3. Generic Issue Processing
For any other backlog task (e.g., categorizing features, updating labels, or
custom analysis), use the `generic_processor.py`. This script allows you to
+28 -24
View File
@@ -2,24 +2,22 @@
Purpose: Exports analyzed JSON issue data into a human-readable CSV format.
This is typically the final step in the workflow, making the output suitable for sharing, spreadsheet import, or manual review.
"""
import argparse
import json
import csv
from datetime import datetime
BUGS_FILE = 'data/bugs.json'
METADATA_FILE = 'data/metadata_bugs.json'
CSV_FILE = 'data/bugs.csv'
parser = argparse.ArgumentParser(description="Export JSON issues to CSV.")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file")
parser.add_argument("--output", default="data/bugs.csv", help="Output CSV file")
args = parser.parse_args()
with open(BUGS_FILE, 'r') as f:
bugs = json.load(f)
with open(args.input, 'r') as f:
issues = json.load(f)
with open(METADATA_FILE, 'r') as f:
metadata_list = json.load(f)
today = datetime.now().strftime("%Y-%m-%d")
metadata_map = {m['number']: m for m in metadata_list}
today = "2026-04-21"
with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
with open(args.output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerow([
'Issue ID', 'Title', 'Status', 'Assignee', 'Labels',
@@ -27,25 +25,31 @@ with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
'reasoning', 'recommended_implementation'
])
for bug in bugs:
num = bug.get('number')
meta = metadata_map.get(num, {})
for issue in issues:
num = issue.get('number')
assignee = ", ".join([a['login'] for a in meta.get('assignees', [])])
labels = ", ".join([l['name'] for l in meta.get('labels', [])])
assignee_list = issue.get('assignees', [])
if isinstance(assignee_list, dict) and 'nodes' in assignee_list:
assignee_list = assignee_list['nodes']
assignee = ", ".join([a.get('login', '') for a in assignee_list])
labels_list = issue.get('labels', [])
if isinstance(labels_list, dict) and 'nodes' in labels_list:
labels_list = labels_list['nodes']
labels = ", ".join([l.get('name', '') for l in labels_list])
writer.writerow([
num,
bug.get('title', ''),
meta.get('state', 'open'),
issue.get('title', ''),
issue.get('state', 'OPEN'),
assignee,
labels,
today,
bug.get('url', ''),
bug.get('analysis', ''),
bug.get('effort_level', ''),
bug.get('reasoning', ''),
bug.get('recommended_implementation', '')
issue.get('url', ''),
issue.get('analysis', ''),
issue.get('effort_level', ''),
issue.get('reasoning', ''),
issue.get('recommended_implementation', '')
])
print(f"Successfully generated {CSV_FILE}")
print(f"Successfully generated {args.output}")
-20
View File
@@ -1,20 +0,0 @@
#!/bin/bash
# Run from the project root or the scripts/backlog-analysis directory
# This script assumes it's running in the same directory as the python scripts

# Abort early when no API key was provided.
if test -z "$GEMINI_API_KEY"; then
    echo "Error: GEMINI_API_KEY environment variable is required."
    echo "Usage: GEMINI_API_KEY=your_key ./loop_analyzer.sh"
    exit 1
fi

# Keep re-running the single-turn analyzer until no bug is left without a
# usable analysis (missing, empty, shorter than 30 chars, or explicitly failed).
while :; do
    count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' data/bugs.json)
    if [ "$count" -eq 0 ]; then
        echo "All bugs processed!"
        break
    fi
    echo "Remaining bugs: $count"
    python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY"
done

# Export the final results once everything has been analyzed.
python3 generate_bugs_csv.py
+53
View File
@@ -0,0 +1,53 @@
#!/bin/bash
# run_pipeline.sh
# Purpose: Orchestrates the full effort analysis pipeline end-to-end:
#   initial triage -> deep agentic analysis -> iterative recovery ->
#   heuristic validation -> CSV export.
#
# Usage: GEMINI_API_KEY=your_key ./run_pipeline.sh [INPUT_FILE] [PROJECT_DIR]
#   INPUT_FILE  : JSON file of issues to analyze (default: data/bugs.json)
#   PROJECT_DIR : project root used for codebase heuristics (default: ../../packages)

# Fail fast: abort on the first failing step, on use of unset variables, and
# on failures inside pipelines, so later stages never run on partial data.
set -euo pipefail

# ${VAR:-} keeps the emptiness check safe under 'set -u' when the key is unset.
if [ -z "${GEMINI_API_KEY:-}" ]; then
    echo "Error: GEMINI_API_KEY environment variable is required."
    echo "Usage: GEMINI_API_KEY=your_key ./run_pipeline.sh [INPUT_FILE] [PROJECT_DIR]"
    exit 1
fi

INPUT_FILE=${1:-"data/bugs.json"}
PROJECT_DIR=${2:-"../../packages"}
# Derive the CSV path from the input name (data/bugs.json -> data/bugs.csv).
OUTPUT_CSV="${INPUT_FILE%.json}.csv"

if [ ! -f "$INPUT_FILE" ]; then
    echo "Error: input file '$INPUT_FILE' not found."
    exit 1
fi

echo "=========================================="
echo "Step 1: Initial Triage (Static Pass)"
echo "=========================================="
python3 analyze_bugs.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
echo ""

echo "=========================================="
echo "Step 2: Deep Agentic Analysis"
echo "=========================================="
python3 bug_analyzer_final.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
echo ""

echo "=========================================="
echo "Step 3: Iterative Recovery Analysis"
echo "=========================================="
# Re-run the single-turn analyzer until every issue has a usable analysis.
# MAX_PASSES guards against spinning forever when some issues never converge.
MAX_PASSES=20
pass=0
while true; do
    # '|| true' keeps a jq failure (e.g. malformed JSON) from killing the
    # script under 'set -e'; the empty-count check below handles that case.
    count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' "$INPUT_FILE" 2>/dev/null || true)
    if [ -z "$count" ] || [ "$count" -eq 0 ]; then
        echo "All issues successfully processed!"
        break
    fi
    if [ "$pass" -ge "$MAX_PASSES" ]; then
        echo "Warning: $count issue(s) still unanalyzed after $MAX_PASSES passes; continuing."
        break
    fi
    pass=$((pass + 1))
    echo "Remaining unanalyzed issues: $count"
    python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
done
echo ""

echo "=========================================="
echo "Step 4: Heuristic Validation"
echo "=========================================="
python3 utils/validate_effort.py --input "$INPUT_FILE" --project "$PROJECT_DIR"
echo ""

echo "=========================================="
echo "Step 5: Exporting to CSV"
echo "=========================================="
python3 generate_bugs_csv.py --input "$INPUT_FILE" --output "$OUTPUT_CSV"
echo ""
echo "✅ Pipeline Complete! Results saved to $OUTPUT_CSV"
@@ -2,12 +2,18 @@
Purpose: Runs heuristic post-analysis validation on the AI's effort estimations.
Checks for keywords (like 'Windows', 'WSL', 'PTY') in the issue body to ensure the AI didn't underestimate platform-specific or architecturally complex bugs as 'small'.
"""
import argparse
import json
import re
import os
ISSUES_FILE = 'backlog-analysis/issues.json'
REPO_ROOT = '/Users/cocosheng/gemini-cli'
parser = argparse.ArgumentParser(description="Validate effort levels using heuristics.")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing analyzed issues")
parser.add_argument("--project", default="../../packages", help="Project root for codebase validation")
args = parser.parse_args()
ISSUES_FILE = args.input
REPO_ROOT = args.project
with open(ISSUES_FILE, 'r') as f:
issues = json.load(f)
@@ -35,17 +41,13 @@ SMALL_KEYWORDS = [
]
def find_files_in_text(text):
    """Extract candidate source-file paths mentioned in *text*.

    Matches path-like tokens ending in .ts, .tsx, .js, .json or .md
    (e.g. ``packages/cli/src/ui/components/Footer.tsx`` or ``Footer.tsx``)
    and returns them as a set, dropping anything that starts with 'http'
    to filter out URLs and common false positives.
    """
    # The trailing \b stops the alternation from truncating the match via
    # backtracking: without it, 'Footer.tsx' matched as 'Footer.ts' because
    # the 'ts' branch is tried first and nothing anchored the extension end.
    matches = re.findall(r'([\w\./\-]+\.(?:ts|tsx|js|json|md))\b', text)
    # Filter out URLs or common false positives.
    return {m for m in matches if not m.startswith('http')}
def resolve_file(filename):
if os.path.exists(os.path.join(REPO_ROOT, filename)):
return os.path.join(REPO_ROOT, filename)
# Try searching the repo for the basename
basename = os.path.basename(filename)
for root, dirs, files in os.walk(REPO_ROOT):
if '.git' in root or 'node_modules' in root:
@@ -82,7 +84,6 @@ def analyze_issue(issue):
effort = "small"
validation_msg = ""
# Keyword analysis
keyword_effort = "small"
for kw in LARGE_KEYWORDS:
if re.search(r'\b' + re.escape(kw) + r'\b', combined_text):
@@ -95,9 +96,7 @@ def analyze_issue(issue):
keyword_effort = "medium"
break
# Codebase heuristic
if num_files == 0:
# If no files found, rely strictly on keywords, but default to medium to be safe
effort = keyword_effort if keyword_effort in ['medium', 'large'] else 'medium'
validation_msg = f"No specific files identified in codebase. Keyword heuristic: {keyword_effort}."
else:
@@ -121,9 +120,7 @@ for issue in issues:
issue['effort_level'] = new_effort
# Store the validation reason in the reasoning field
existing_reasoning = issue.get('reasoning', '')
# Strip any previous validation messages
existing_reasoning = existing_reasoning.split(' | Codebase validation:')[0]
existing_reasoning = existing_reasoning.split(' | No specific files identified')[0]