# NOTE(review): the lines below replace non-Python residue from a git web-UI
# scrape (file-listing chrome), preserved here only as provenance:
# gemini-cli/scripts/backlog-analysis/analyze_pipeline.py — 396 lines, 17 KiB, Python

"""
Purpose: A unified pipeline that performs end-to-end effort analysis on a dataset of GitHub issues.
It combines agentic deep analysis, single-turn fallbacks, heuristic validation, and CSV export in a single, efficient execution, avoiding redundant file operations.
"""
import argparse
import json
import urllib.request
import os
import subprocess
import re
import concurrent.futures
import threading
import csv
from datetime import datetime
from pathlib import Path
# Gemini model used for every generateContent call in this pipeline.
MODEL = "gemini-3-flash-preview"
# Serializes writes of the shared issues JSON across worker threads.
file_lock = threading.Lock()
# --- VALIDATION HEURISTICS ---
# Terms whose presence in an issue (title/body/analysis/reasoning) suggests a
# "large" effort: platform/terminal quirks, concurrency, performance,
# protocol or architectural work.  Matched case-insensitively as whole words
# by validate_effort().
LARGE_KEYWORDS = [
    'windows', 'win32', 'wsl', 'wsl2', 'pty', 'pseudo-terminal', 'child_process', 'spawn', 'sigint', 'sigterm',
    'memory leak', 'performance', 'boot time', 'infinite loop', 'hangs', 'freezes', 'crashes', 'race condition',
    'intermittent', 'sometimes', 'flickering', 'a2a', 'mcp protocol', 'scheduler', 'event loop', 'websocket',
    'stream', 'throughput', 'concurrency', 'deadlock', 'file descriptor', 'architecture', 'refactor'
]
# Terms suggesting a "medium" effort: UI/React state, parsing, async plumbing,
# configuration and rendering concerns.  Only consulted when no LARGE keyword hit.
MEDIUM_KEYWORDS = [
    'react', 'hook', 'useeffect', 'usestate', 'usememo', 'ink', 'tui', 'ui state', 'parser', 'markdown',
    'regex', 'regular expression', 'ansi', 'escape sequence', 'toml', 'schema', 'validation', 'zod',
    'promise', 'async', 'await', 'unhandled', 'rejection', 'config', 'settings', 'env', 'environment',
    'path resolution', 'symlink', 'git', 'telemetry', 'logging', 'format', 'display', 'rendering',
    'clipboard', 'copy', 'paste', 'bracketed', 'interactive', 'dialog', 'modal', 'focus'
]
# Gemini function-calling declarations: the two tools the agentic loop offers
# the model.  When the model emits a functionCall for one of these, it is
# executed locally by execute_tool() and the result fed back as a turn.
tools_decl = [
    {
        "functionDeclarations": [
            {
                "name": "search_codebase",
                "description": "Search the project directory for a string using grep. Returns matching lines and file paths.",
                "parameters": {
                    "type": "OBJECT",
                    "properties": {
                        "pattern": {"type": "STRING", "description": "The text pattern to search for"}
                    },
                    "required": ["pattern"]
                }
            },
            {
                "name": "read_file",
                "description": "Read a specific file to understand its context.",
                "parameters": {
                    "type": "OBJECT",
                    "properties": {
                        "filepath": {"type": "STRING", "description": "The path to the file"}
                    },
                    "required": ["filepath"]
                }
            }
        ]
    }
]
def call_gemini(messages, url):
    """POST the conversation to a Gemini generateContent endpoint.

    Sends the full message history together with the module-level tool
    declarations at a low temperature, and returns the decoded JSON body.

    Raises whatever urllib raises on network/HTTP failure (callers catch it).
    """
    payload = json.dumps({
        "contents": messages,
        "tools": tools_decl,
        "generationConfig": {"temperature": 0.1},
    }).encode('utf-8')
    request = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json'},
    )
    with urllib.request.urlopen(request, timeout=120) as resp:
        body = resp.read().decode('utf-8')
    return json.loads(body)
def execute_tool(call, project_path):
    """Execute a model-requested tool call and return its textual result.

    Supported tools (see tools_decl): 'search_codebase' greps the project for
    a pattern, filtering out build artefacts/tests and keeping 20 lines;
    'read_file' returns the first 300 lines of a file, falling back to a
    basename search when the given path does not exist.

    Security fix: the original built shell command strings with f-strings and
    shell=True; escaping only double quotes still allowed `$()`/backtick
    injection from model-supplied arguments.  All subprocess use is now
    shell=False with list arguments, and file reading is pure Python.

    Args:
        call: dict with 'name' and optional 'args' from the functionCall.
        project_path: root directory tools are allowed to operate on.

    Returns:
        A result string (possibly an error description); "Unknown tool" for
        unrecognized tool names.
    """
    name = call['name']
    args = call.get('args', {})
    if name == 'search_codebase':
        pattern = args.get('pattern', '')
        try:
            # '--' stops grep from treating a leading '-' in the pattern as a flag.
            proc = subprocess.run(
                ['grep', '-rn', '--', pattern, project_path],
                capture_output=True, text=True)
        except OSError as e:
            return str(e)
        noise = re.compile(r'node_modules|dist|build|\.test\.')
        hits = [ln for ln in proc.stdout.splitlines() if not noise.search(ln)][:20]
        return '\n'.join(hits) if hits else "No matches found."
    elif name == 'read_file':
        filepath = args.get('filepath', '')
        if not filepath.startswith('/'):
            filepath = os.path.join(project_path, filepath)
        try:
            if not os.path.exists(filepath):
                # Model often guesses wrong directories: search by basename.
                basename = os.path.basename(filepath)
                found = None
                for root, _dirs, files in os.walk(project_path):
                    if basename in files:
                        found = os.path.join(root, basename)
                        break
                if found:
                    filepath = found
                else:
                    return f"File {filepath} not found."
            # Cap at 300 lines (same budget as the original `head -n 300`).
            collected = []
            with open(filepath, 'r', encoding='utf-8', errors='replace') as fh:
                for i, line in enumerate(fh):
                    if i >= 300:
                        break
                    collected.append(line)
            return ''.join(collected)
        except Exception as e:
            return str(e)
    return "Unknown tool"
def analyze_issue_agentic(issue, url, project_path):
    """Analyze one issue with a multi-turn, tool-using Gemini conversation.

    The model gets the search_codebase/read_file tools (see tools_decl) and
    up to 15 turns to investigate the codebase before emitting a JSON verdict.

    Args:
        issue: dict with at least 'title' and optionally 'body'.
        url: full generateContent endpoint URL (includes the API key).
        project_path: root directory the tools operate on.

    Returns:
        Parsed verdict dict ('analysis', 'effort_level', 'reasoning', and
        optionally 'recommended_implementation'); on any error or when the
        turn cap is hit, a fallback dict with effort_level='medium'.
    """
    system_instruction = """You are a senior software engineer analyzing bug/feature reports.
You MUST use the provided tools to investigate the codebase and pinpoint exactly which files and logic are responsible.
DO NOT GUESS.
Rating Effort Level:
- small (1 day): Localized fix/change (1-2 files), clear logic.
- medium (2-3 days): Harder to trace, state management, touches multiple components.
- large (>3 days): Architectural issues, core protocol changes, or very complex multi-package bugs.
REPRODUCTION RULE:
If a bug is hard to reproduce (specific OS, complex setup, intermittent/flickering), it MUST NOT be rated as small.
Output format (ONLY valid JSON, NO markdown):
{
"analysis": "technical analysis of root cause and fix",
"effort_level": "small|medium|large",
"reasoning": "justification with specific files/lines/logic you found",
"recommended_implementation": "code snippets or specific logic changes (only if small)"
}
"""
    # Issue body is truncated to 1500 chars to bound prompt size.
    prompt = f"{system_instruction}\n\nIssue Title: {issue.get('title')}\nIssue Body: {issue.get('body', '')[:1500]}"
    messages = [{"role": "user", "parts": [{"text": prompt}]}]
    for turn in range(15): # Limit turns to 15 for efficiency in unified loop
        try:
            res = call_gemini(messages, url)
            candidate = res['candidates'][0]['content']
            parts = candidate.get('parts', [])
            # The API may omit 'role'; it is required when echoing history back.
            if 'role' not in candidate: candidate['role'] = 'model'
            messages.append(candidate)
            function_calls = [p for p in parts if 'functionCall' in p]
            if function_calls:
                # Execute every requested tool and feed all results back as one turn.
                tool_responses = []
                for fcall in function_calls:
                    call_data = fcall['functionCall']
                    result = execute_tool(call_data, project_path)
                    tool_responses.append({
                        "functionResponse": {
                            "name": call_data['name'],
                            # Truncate tool output to keep the context window bounded.
                            "response": {"result": result[:5000]}
                        }
                    })
                messages.append({"role": "user", "parts": tool_responses})
            else:
                # No tool call: expect the final JSON verdict as plain text.
                text = parts[0].get('text', '')
                if not text: continue
                # Strip markdown fences the model sometimes adds despite instructions.
                text = text.replace('```json', '').replace('```', '').strip()
                return json.loads(text)
        except Exception as e:
            # Any API/parse failure aborts the loop; caller gets the fallback below,
            # which process_pipeline_task treats as a signal to try the single-turn path.
            break
    return {"analysis": "Failed to analyze autonomously", "effort_level": "medium", "reasoning": "Agent loop exceeded limit or errored."}
def extract_keywords(text):
    """Pull likely code identifiers out of issue text for grep seeding.

    Matches CapitalizedWords, *.ts/*.tsx filenames, and *Service/*Command
    identifiers; drops tokens of 4 chars or fewer; de-duplicates; caps at 8.

    Fix: the original used list(set(...)), which made the keyword order (and
    therefore which 8 survive the cap) non-deterministic between runs.
    dict.fromkeys de-duplicates while preserving first-occurrence order.

    Returns:
        list[str] of at most 8 keywords, in order of first appearance.
    """
    words = re.findall(r'\b[A-Z][a-zA-Z0-9]+\b|\b\w+\.tsx?\b|\b\w+Service\b|\b\w+Command\b', text)
    unique = list(dict.fromkeys(w for w in words if len(w) > 4))
    return unique[:8]
def search_codebase_static(keywords, project_path):
    """Grep the project for each keyword and build a labelled context blob.

    For every keyword, runs grep with list arguments, filters out build
    artefacts (node_modules/dist/build) and .test. files, and keeps at most
    8 matching lines per keyword.

    Security fix: the original interpolated keywords into a shell=True command
    string (quote-escaping alone does not stop `$()`/backtick injection); the
    bare `except:` is also narrowed to OSError so real bugs are not swallowed.

    Returns:
        Concatenated "--- Matches for <kw> ---" sections, or "" when nothing
        matched (or grep is unavailable).
    """
    noise = re.compile(r'node_modules|dist|build|\.test\.')
    context = ""
    for kw in keywords:
        try:
            proc = subprocess.run(
                ['grep', '-rn', '--', kw, project_path],
                capture_output=True, text=True)
        except OSError:
            # grep missing/unusable: skip this keyword, best-effort like before.
            continue
        hits = [ln for ln in proc.stdout.splitlines() if not noise.search(ln)][:8]
        if hits:
            context += f"\n--- Matches for {kw} ---\n" + "\n".join(hits) + "\n\n"
    return context
def analyze_issue_single_turn(issue, url, project_path):
    """Fallback analysis: one Gemini call seeded with static grep context.

    Used when the agentic loop fails.  Extracts likely identifiers from the
    issue text, greps the project for them, and sends a single prompt (no
    tools declared) asking for the JSON verdict directly.

    Args:
        issue: dict with 'title' and optionally 'body'.
        url: full generateContent endpoint URL (includes the API key).
        project_path: root directory for the static grep search.

    Returns:
        Parsed verdict dict, or a 'medium' fallback dict whose reasoning
        carries the error string when the call or JSON parse fails.
    """
    title = issue.get('title', '')
    body = issue.get('body', '')[:1500]
    keywords = extract_keywords(title + " " + body)
    code_context = search_codebase_static(keywords, project_path)
    prompt = f"""You are a senior software engineer analyzing issues.
Based on the description and codebase search context, pinpoint exactly which files and logic are responsible.
DO NOT GUESS. If the context isn't enough, provide your best technical hypothesis.
Rating Effort Level:
- small (1 day): Localized fix (1-2 files), clear cause.
- medium (2-3 days): Touches multiple components or hard to trace.
- large (>3 days): Architectural issues, Windows/WSL-specific, core protocols.
Issue Title: {title}
Issue Body: {body}
Codebase Search Context:
{code_context[:8000]}
Output ONLY valid JSON (no markdown block):
{{
"analysis": "technical analysis of root cause and fix",
"effort_level": "small|medium|large",
"reasoning": "justification with specific files/lines found"
}}
"""
    data = {
        # Single-shot request: no tools, low temperature for determinism.
        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0.1}
    }
    try:
        req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
        with urllib.request.urlopen(req, timeout=60) as response:
            res = json.loads(response.read().decode('utf-8'))
        txt = res['candidates'][0]['content']['parts'][0]['text']
        # Strip markdown fences the model sometimes adds despite instructions.
        txt = txt.replace('```json', '').replace('```', '').strip()
        return json.loads(txt)
    except Exception as e:
        # Best-effort fallback: report the error in 'reasoning' rather than crash a worker.
        return {"analysis": "Failed to analyze autonomously", "effort_level": "medium", "reasoning": str(e)}
# --- VALIDATION ---
def find_files_in_text(text):
    """Extract candidate source-file paths (.ts/.tsx/.js/.json/.md) from text.

    Fix: the original alternation order `ts|tsx|js|json|md` made the regex
    engine take the first matching alternative, truncating "App.tsx" to
    "App.ts" and "data.json" to "data.js".  The alternation is now ordered
    longest-first and anchored with \\b so extensions match exactly.

    Returns:
        set[str] of matched paths, excluding anything starting with 'http'.
    """
    matches = re.findall(r'([\w\.\/\-]+\.(?:tsx|ts|json|js|md))\b', text)
    return set(m for m in matches if not m.startswith('http'))
def resolve_file(filename, project_path):
    """Map a file path mentioned in text to a real path under project_path.

    Tries the path relative to the project root first; otherwise walks the
    tree (skipping .git and node_modules) looking for the basename.

    Returns:
        The resolved absolute/joined path, or None when nothing matches.
    """
    direct = os.path.join(project_path, filename)
    if os.path.exists(direct):
        return direct
    target = os.path.basename(filename)
    for root, _dirs, files in os.walk(project_path):
        # Skip version-control and dependency directories entirely.
        if '.git' in root or 'node_modules' in root:
            continue
        if target in files:
            return os.path.join(root, target)
    return None
def validate_effort(issue, project_path):
    """Cross-check the model's effort rating against the actual codebase.

    Resolves every file path mentioned in the issue text + model output,
    counts their lines, and combines that with a keyword heuristic
    (LARGE_KEYWORDS / MEDIUM_KEYWORDS) to produce a final effort level.

    Returns:
        (effort_level, validation_message) where effort_level is
        'small' | 'medium' | 'large'.
    """
    combined = " ".join([
        issue.get('title', '').lower(),
        issue.get('body', '').lower(),
        issue.get('analysis', '').lower(),
        issue.get('reasoning', '').lower(),
    ])

    # Resolve mentioned files and tally their sizes (deduplicated by path).
    resolved_files = []  # list of (path, line_count)
    total_lines = 0
    for candidate in find_files_in_text(combined):
        path = resolve_file(candidate, project_path)
        if path is None or any(path == p for p, _ in resolved_files):
            continue
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                count = sum(1 for _ in fh)
        except Exception:
            continue  # unreadable file: ignore, best-effort
        resolved_files.append((path, count))
        total_lines += count

    # Keyword heuristic: large wins over medium; default small.
    hint = "small"
    for kw in LARGE_KEYWORDS:
        if re.search(r'\b' + re.escape(kw) + r'\b', combined):
            hint = "large"
            break
    if hint != "large":
        for kw in MEDIUM_KEYWORDS:
            if re.search(r'\b' + re.escape(kw) + r'\b', combined):
                hint = "medium"
                break

    num_files = len(resolved_files)
    if num_files == 0:
        # Nothing verifiable in the codebase: fall back to the keyword hint,
        # never rating lower than medium.
        effort = hint if hint in ('medium', 'large') else 'medium'
        return effort, f"No specific files identified in codebase. Keyword heuristic: {hint}."

    details = ", ".join(f"{os.path.basename(p)} ({n} lines)" for p, n in resolved_files)
    if num_files > 3 or total_lines > 1500 or hint == "large":
        return "large", f"Codebase validation: {num_files} files ({details}), {total_lines} total lines. Keyword hint: {hint}."
    if num_files >= 2 or total_lines > 500 or hint == "medium":
        return "medium", f"Codebase validation: {num_files} files ({details}), {total_lines} total lines. Keyword hint: {hint}."
    return "small", f"Codebase validation: {num_files} files ({details}), {total_lines} total lines. Appears highly localized."
def process_pipeline_task(args_tuple):
    """Worker body: analyze one issue (if needed), validate it, persist state.

    args_tuple: (issue, url, project_path, input_file, all_issues).  `issue`
    is a reference into `all_issues`, so mutating it updates the shared list
    that gets re-serialized to input_file under file_lock.

    Returns:
        The (mutated) issue dict.
    """
    issue, url, project_path, input_file, all_issues = args_tuple
    # Re-analyze when the stored analysis is missing, previously failed,
    # or suspiciously short (< 30 chars).
    needs_analysis = not issue.get('analysis') or issue.get('analysis') == "Failed to analyze autonomously" or len(issue.get('analysis', '')) < 30
    if needs_analysis:
        print(f"[{issue.get('number', 'unknown')}] Starting Agentic Analysis...")
        result = analyze_issue_agentic(issue, url, project_path)
        if result.get('analysis') == "Failed to analyze autonomously":
            # Agentic loop gave up: fall back to the cheaper single-call path.
            print(f"[{issue.get('number', 'unknown')}] Agentic failed. Falling back to Single-Turn Context Analysis...")
            result = analyze_issue_single_turn(issue, url, project_path)
        issue['analysis'] = result.get('analysis', 'Failed to analyze')
        issue['effort_level'] = result.get('effort_level', 'medium')
        issue['reasoning'] = result.get('reasoning', 'Could not determine')
        if 'recommended_implementation' in result:
            issue['recommended_implementation'] = result['recommended_implementation']
        # Checkpoint the whole dataset so progress survives interruption.
        with file_lock:
            with open(input_file, 'w') as f:
                json.dump(all_issues, f, indent=2)
    # Validation
    old_effort = issue.get('effort_level')
    new_effort, validation_reason = validate_effort(issue, project_path)
    issue['effort_level'] = new_effort
    existing_reasoning = issue.get('reasoning', '')
    # Strip any validation suffix left by a previous run before appending a fresh one.
    existing_reasoning = existing_reasoning.split(' | Codebase validation:')[0]
    existing_reasoning = existing_reasoning.split(' | No specific files identified')[0]
    issue['reasoning'] = f"{existing_reasoning} | {validation_reason}".strip(' |')
    # Second checkpoint only when something actually changed.
    if needs_analysis or old_effort != new_effort:
        with file_lock:
            with open(input_file, 'w') as f:
                json.dump(all_issues, f, indent=2)
    print(f"[{issue.get('number', 'unknown')}] Completed -> {issue.get('effort_level')}")
    return issue
def export_csv(issues, output_csv):
    """Write the analyzed issues to a tab-separated export file.

    Handles both plain-list and GraphQL-style ({'nodes': [...]}) shapes for
    assignees/labels.  Note: the delimiter is a tab despite the .csv name.
    """
    sync_date = datetime.now().strftime("%Y-%m-%d")
    header = [
        'Issue ID', 'Title', 'Status', 'Assignee', 'Labels',
        'Last Sync', 'Link', 'analysis', 'effort_level',
        'reasoning', 'recommended_implementation'
    ]
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(header)
        for issue in issues:
            # Unwrap GraphQL connection objects to plain lists when present.
            assignees = issue.get('assignees', [])
            if isinstance(assignees, dict) and 'nodes' in assignees:
                assignees = assignees['nodes']
            labels = issue.get('labels', [])
            if isinstance(labels, dict) and 'nodes' in labels:
                labels = labels['nodes']
            writer.writerow([
                issue.get('number'),
                issue.get('title', ''),
                issue.get('state', 'OPEN'),
                ", ".join(a.get('login', '') for a in assignees),
                ", ".join(l.get('name', '') for l in labels),
                sync_date,
                issue.get('url', ''),
                issue.get('analysis', ''),
                issue.get('effort_level', ''),
                issue.get('reasoning', ''),
                issue.get('recommended_implementation', ''),
            ])
    print(f"Exported successfully to {output_csv}")
def main():
    """CLI entry point: load issues, fan analysis out to workers, export CSV."""
    parser = argparse.ArgumentParser(description="Unified Effort Analysis Pipeline.")
    parser.add_argument("--api-key", required=True, help="Gemini API Key")
    parser.add_argument("--input", default="data/bugs.json", help="Input JSON file")
    parser.add_argument("--project", default="../../packages", help="Project root to analyze")
    parser.add_argument("--workers", type=int, default=4, help="Number of concurrent workers")
    args = parser.parse_args()

    endpoint = (
        "https://generativelanguage.googleapis.com/v1beta/models/"
        f"{MODEL}:generateContent?key={args.api_key}"
    )
    with open(args.input, 'r') as f:
        issues = json.load(f)
    print(f"Starting unified analysis pipeline on {len(issues)} issues...")

    # Each task carries a reference to the shared issues list so workers
    # can checkpoint the whole dataset as they go.
    work = [(issue, endpoint, args.project, args.input, issues) for issue in issues]
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as pool:
        # Drain the iterator so worker exceptions propagate here.
        list(pool.map(process_pipeline_task, work))

    export_csv(issues, args.input.replace('.json', '.csv'))
    print("Pipeline fully complete!")


if __name__ == '__main__':
    main()