From ab41447450444c6ada8f3d0b708c51923e2637de Mon Sep 17 00:00:00 2001
From: Coco Sheng
Date: Wed, 6 May 2026 16:26:54 -0400
Subject: [PATCH] feat: merge steps 2-5 into a unified pipeline script
 run_pipeline.sh

---
 scripts/backlog-analysis/README.md            | 40 +++----------
 scripts/backlog-analysis/generate_bugs_csv.py | 53 ++++++++++---------
 scripts/backlog-analysis/loop_analyzer.sh     | 20 -------
 scripts/backlog-analysis/run_pipeline.sh      | 58 ++++++++++++++++++++
 .../backlog-analysis/utils/validate_effort.py | 19 +++----
 5 files changed, 103 insertions(+), 87 deletions(-)
 delete mode 100755 scripts/backlog-analysis/loop_analyzer.sh
 create mode 100755 scripts/backlog-analysis/run_pipeline.sh

diff --git a/scripts/backlog-analysis/README.md b/scripts/backlog-analysis/README.md
index 1d8feec7fb..7306f218bd 100644
--- a/scripts/backlog-analysis/README.md
+++ b/scripts/backlog-analysis/README.md
@@ -11,8 +11,8 @@ and determining implementation effort levels for the Gemini CLI project.
   validation (e.g., `validate_effort.py`, `inject_manual_fixes.py`).
 - `*.py`: Core analysis and export scripts (e.g., `bug_analyzer_final.py`,
   `generate_bugs_csv.py`).
-- `loop_analyzer.sh`: A shell script for running iterative analysis until all
-  issues are processed.
+- `run_pipeline.sh`: A shell script that orchestrates the entire effort analysis
+  pipeline end-to-end.
 
 ## 📥 Prerequisites: Data Generation
 
@@ -51,41 +51,17 @@ gemini "Read data/uncategorized.json. For each issue, determine if it is a bug o
 _Note: Make sure your `gemini-cli` has permission to execute shell commands if
 you want it to apply the labels automatically via `gh`._
 
-### 2. Initial Triage (Static)
+### 2. Full Effort Analysis Pipeline
 
-Use this for a quick, first-pass estimation.
+Instead of running individual steps manually, you can run the entire analysis
+pipeline (Initial Triage -> Deep Agentic Analysis -> Iterative Recovery ->
+Validation -> CSV Export) with a single command.
 
 ```bash
-python3 analyze_bugs.py --api-key "YOUR_KEY"
+GEMINI_API_KEY="YOUR_KEY" ./run_pipeline.sh data/bugs.json ../../packages
 ```
 
-### 3. Deep Agentic Analysis
-
-Uses Gemini as an agent with access to the codebase.
-
-```bash
-python3 bug_analyzer_final.py --api-key "YOUR_KEY"
-```
-
-### 4. Iterative Analysis
-
-Runs the single-turn analyzer in a loop until all issues have a valid analysis.
-
-```bash
-GEMINI_API_KEY="YOUR_KEY" ./loop_analyzer.sh
-```
-
-### 5. Validation & Export
-
-Run validation from the utils folder to ensure consistency, then generate a
-readable report.
-
-```bash
-python3 utils/validate_effort.py
-python3 generate_bugs_csv.py
-```
-
-### 6. Generic Issue Processing
+### 3. Generic Issue Processing
 
 For any other backlog task (e.g., categorizing features, updating labels, or
 custom analysis), use the `generic_processor.py`. This script allows you to
diff --git a/scripts/backlog-analysis/generate_bugs_csv.py b/scripts/backlog-analysis/generate_bugs_csv.py
index a4ffeeb070..f745bfa951 100644
--- a/scripts/backlog-analysis/generate_bugs_csv.py
+++ b/scripts/backlog-analysis/generate_bugs_csv.py
@@ -2,24 +2,22 @@
 Purpose: Exports analyzed JSON issue data into a human-readable CSV format.
 This is typically the final step in the workflow, making the output suitable
 for sharing, spreadsheet import, or manual review.
""" +import argparse import json import csv from datetime import datetime -BUGS_FILE = 'data/bugs.json' -METADATA_FILE = 'data/metadata_bugs.json' -CSV_FILE = 'data/bugs.csv' +parser = argparse.ArgumentParser(description="Export JSON issues to CSV.") +parser.add_argument("--input", default="data/bugs.json", help="Input JSON file") +parser.add_argument("--output", default="data/bugs.csv", help="Output CSV file") +args = parser.parse_args() -with open(BUGS_FILE, 'r') as f: - bugs = json.load(f) +with open(args.input, 'r') as f: + issues = json.load(f) -with open(METADATA_FILE, 'r') as f: - metadata_list = json.load(f) +today = datetime.now().strftime("%Y-%m-%d") -metadata_map = {m['number']: m for m in metadata_list} -today = "2026-04-21" - -with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f: +with open(args.output, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f, delimiter='\t') writer.writerow([ 'Issue ID', 'Title', 'Status', 'Assignee', 'Labels', @@ -27,25 +25,31 @@ with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f: 'reasoning', 'recommended_implementation' ]) - for bug in bugs: - num = bug.get('number') - meta = metadata_map.get(num, {}) + for issue in issues: + num = issue.get('number') - assignee = ", ".join([a['login'] for a in meta.get('assignees', [])]) - labels = ", ".join([l['name'] for l in meta.get('labels', [])]) + assignee_list = issue.get('assignees', []) + if isinstance(assignee_list, dict) and 'nodes' in assignee_list: + assignee_list = assignee_list['nodes'] + assignee = ", ".join([a.get('login', '') for a in assignee_list]) + + labels_list = issue.get('labels', []) + if isinstance(labels_list, dict) and 'nodes' in labels_list: + labels_list = labels_list['nodes'] + labels = ", ".join([l.get('name', '') for l in labels_list]) writer.writerow([ num, - bug.get('title', ''), - meta.get('state', 'open'), + issue.get('title', ''), + issue.get('state', 'OPEN'), assignee, labels, today, - bug.get('url', ''), - bug.get('analysis', ''), - bug.get('effort_level', ''), - bug.get('reasoning', ''), - bug.get('recommended_implementation', '') + issue.get('url', ''), + issue.get('analysis', ''), + issue.get('effort_level', ''), + issue.get('reasoning', ''), + issue.get('recommended_implementation', '') ]) -print(f"Successfully generated {CSV_FILE}") +print(f"Successfully generated {args.output}") diff --git a/scripts/backlog-analysis/loop_analyzer.sh b/scripts/backlog-analysis/loop_analyzer.sh deleted file mode 100755 index f2a689f43e..0000000000 --- a/scripts/backlog-analysis/loop_analyzer.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Run from the project root or the scripts/backlog-analysis directory -# This script assumes it's running in the same directory as the python scripts - -if [ -z "$GEMINI_API_KEY" ]; then - echo "Error: GEMINI_API_KEY environment variable is required." - echo "Usage: GEMINI_API_KEY=your_key ./loop_analyzer.sh" - exit 1 -fi - -while true; do - count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' data/bugs.json) - if [ "$count" -eq 0 ]; then - echo "All bugs processed!" 
-        break
-    fi
-    echo "Remaining bugs: $count"
-    python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY"
-done
-python3 generate_bugs_csv.py
diff --git a/scripts/backlog-analysis/run_pipeline.sh b/scripts/backlog-analysis/run_pipeline.sh
new file mode 100755
index 0000000000..f5c1960970
--- /dev/null
+++ b/scripts/backlog-analysis/run_pipeline.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# run_pipeline.sh
+# Purpose: Orchestrates the full effort analysis pipeline end-to-end.
+
+if [ -z "$GEMINI_API_KEY" ]; then
+  echo "Error: GEMINI_API_KEY environment variable is required."
+  echo "Usage: GEMINI_API_KEY=your_key ./run_pipeline.sh [INPUT_FILE] [PROJECT_DIR]"
+  exit 1
+fi
+
+INPUT_FILE=${1:-"data/bugs.json"}
+PROJECT_DIR=${2:-"../../packages"}
+OUTPUT_CSV="${INPUT_FILE%.json}.csv"
+
+echo "=========================================="
+echo "Step 1: Initial Triage (Static Pass)"
+echo "=========================================="
+python3 analyze_bugs.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
+
+echo ""
+echo "=========================================="
+echo "Step 2: Deep Agentic Analysis"
+echo "=========================================="
+python3 bug_analyzer_final.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
+
+echo ""
+echo "=========================================="
+echo "Step 3: Iterative Recovery Analysis"
+echo "=========================================="
+while true; do
+  # Count issues whose analysis is missing, empty, or suspiciously short (< 30 chars).
+  count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' "$INPUT_FILE")
+  if [ -z "$count" ]; then
+    echo "Error: could not count remaining issues (is jq installed and is $INPUT_FILE valid JSON?)" >&2
+    exit 1
+  fi
+  if [ "$count" -eq 0 ]; then
+    echo "All issues successfully processed!"
+    break
+  fi
+  echo "Remaining unanalyzed issues: $count"
+  python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY" --input "$INPUT_FILE" --project "$PROJECT_DIR"
+done
+
+echo ""
+echo "=========================================="
+echo "Step 4: Heuristic Validation"
+echo "=========================================="
+python3 utils/validate_effort.py --input "$INPUT_FILE" --project "$PROJECT_DIR"
+
+echo ""
+echo "=========================================="
+echo "Step 5: Exporting to CSV"
+echo "=========================================="
+python3 generate_bugs_csv.py --input "$INPUT_FILE" --output "$OUTPUT_CSV"
+
+echo ""
+echo "✅ Pipeline Complete! Results saved to $OUTPUT_CSV"
diff --git a/scripts/backlog-analysis/utils/validate_effort.py b/scripts/backlog-analysis/utils/validate_effort.py
index f6df2bf529..f7f9923fbf 100644
--- a/scripts/backlog-analysis/utils/validate_effort.py
+++ b/scripts/backlog-analysis/utils/validate_effort.py
@@ -2,12 +2,18 @@
 Purpose: Runs heuristic post-analysis validation on the AI's effort estimations.
 Checks for keywords (like 'Windows', 'WSL', 'PTY') in the issue body to ensure
 the AI didn't underestimate platform-specific or architecturally complex bugs
 as 'small'.
""" +import argparse import json import re import os -ISSUES_FILE = 'backlog-analysis/issues.json' -REPO_ROOT = '/Users/cocosheng/gemini-cli' +parser = argparse.ArgumentParser(description="Validate effort levels using heuristics.") +parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing analyzed issues") +parser.add_argument("--project", default="../../packages", help="Project root for codebase validation") +args = parser.parse_args() + +ISSUES_FILE = args.input +REPO_ROOT = args.project with open(ISSUES_FILE, 'r') as f: issues = json.load(f) @@ -35,17 +41,13 @@ SMALL_KEYWORDS = [ ] def find_files_in_text(text): - # match patterns like packages/cli/src/ui/components/Footer.tsx or Footer.tsx - # We will look for anything ending in .ts, .tsx, .js, .json matches = re.findall(r'([\w\.\/\-]+\.(?:ts|tsx|js|json|md))', text) - # filter out URLs or common false positives return set([m for m in matches if not m.startswith('http')]) def resolve_file(filename): if os.path.exists(os.path.join(REPO_ROOT, filename)): return os.path.join(REPO_ROOT, filename) - # Try searching the repo for the basename basename = os.path.basename(filename) for root, dirs, files in os.walk(REPO_ROOT): if '.git' in root or 'node_modules' in root: @@ -82,7 +84,6 @@ def analyze_issue(issue): effort = "small" validation_msg = "" - # Keyword analysis keyword_effort = "small" for kw in LARGE_KEYWORDS: if re.search(r'\b' + re.escape(kw) + r'\b', combined_text): @@ -95,9 +96,7 @@ def analyze_issue(issue): keyword_effort = "medium" break - # Codebase heuristic if num_files == 0: - # If no files found, rely strictly on keywords, but default to medium to be safe effort = keyword_effort if keyword_effort in ['medium', 'large'] else 'medium' validation_msg = f"No specific files identified in codebase. Keyword heuristic: {keyword_effort}." else: @@ -121,9 +120,7 @@ for issue in issues: issue['effort_level'] = new_effort - # Store the validation reason in the reasoning field existing_reasoning = issue.get('reasoning', '') - # Strip any previous validation messages existing_reasoning = existing_reasoning.split(' | Codebase validation:')[0] existing_reasoning = existing_reasoning.split(' | No specific files identified')[0]