feat: consolidate multi-stage analysis pipeline into a single, efficient python script

2026-05-31 06:02:47 -07:00 · 2026-05-06 16:35:38 -04:00
parent fb236ea0e2
commit 4398261feb
8 changed files with 405 additions and 682 deletions
@@ -1,135 +0,0 @@
-"""
-Purpose: Runs heuristic post-analysis validation on the AI's effort estimations.
-Checks for keywords (like 'Windows', 'WSL', 'PTY') in the issue body to ensure the AI didn't underestimate platform-specific or architecturally complex bugs as 'small'.
-"""
-import argparse
-import json
-import re
-import os
-
-parser = argparse.ArgumentParser(description="Validate effort levels using heuristics.")
-parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing analyzed issues")
-parser.add_argument("--project", default="../../packages", help="Project root for codebase validation")
-args = parser.parse_args()
-
-ISSUES_FILE = args.input
-REPO_ROOT = args.project
-
-with open(ISSUES_FILE, 'r') as f:
-    issues = json.load(f)
-
-# Stricter criteria keywords
-LARGE_KEYWORDS = [
-    'windows', 'win32', 'wsl', 'wsl2', 'pty', 'pseudo-terminal', 'child_process', 'spawn', 'sigint', 'sigterm',
-    'memory leak', 'performance', 'boot time', 'infinite loop', 'hangs', 'freezes', 'crashes', 'race condition',
-    'intermittent', 'sometimes', 'flickering', 'a2a', 'mcp protocol', 'scheduler', 'event loop', 'websocket',
-    'stream', 'throughput', 'concurrency', 'deadlock', 'file descriptor', 'architecture', 'refactor'
-]
-
-MEDIUM_KEYWORDS = [
-    'react', 'hook', 'useeffect', 'usestate', 'usememo', 'ink', 'tui', 'ui state', 'parser', 'markdown',
-    'regex', 'regular expression', 'ansi', 'escape sequence', 'toml', 'schema', 'validation', 'zod',
-    'promise', 'async', 'await', 'unhandled', 'rejection', 'config', 'settings', 'env', 'environment',
-    'path resolution', 'symlink', 'git', 'telemetry', 'logging', 'format', 'display', 'rendering',
-    'clipboard', 'copy', 'paste', 'bracketed', 'interactive', 'dialog', 'modal', 'focus'
-]
-
-SMALL_KEYWORDS = [
-    'typo', 'spelling', 'rename', 'string', 'constant', 'css', 'color', 'theme.status', 'padding', 'margin',
-    'error message', 'econnreset', 'enotdir', 'etimedout', 'documentation', 'jsdoc', 'readme', 'help text',
-    'flag', 'version string', 'static value'
-]
-
-def find_files_in_text(text):
-    matches = re.findall(r'([\w\.\/\-]+\.(?:ts|tsx|js|json|md))', text)
-    return set([m for m in matches if not m.startswith('http')])
-
-def resolve_file(filename):
-    if os.path.exists(os.path.join(REPO_ROOT, filename)):
-        return os.path.join(REPO_ROOT, filename)
-    
-    basename = os.path.basename(filename)
-    for root, dirs, files in os.walk(REPO_ROOT):
-        if '.git' in root or 'node_modules' in root:
-            continue
-        if basename in files:
-            return os.path.join(root, basename)
-    return None
-
-def analyze_issue(issue):
-    title = issue.get('title', '').lower()
-    body = issue.get('body', '').lower()
-    analysis = issue.get('analysis', '').lower()
-    reasoning = issue.get('reasoning', '').lower()
-    
-    combined_text = f"{title} {body} {analysis} {reasoning}"
-    
-    potential_files = find_files_in_text(combined_text)
-    actual_files = []
-    total_lines = 0
-    
-    for f in potential_files:
-        resolved = resolve_file(f)
-        if resolved and resolved not in [a[0] for a in actual_files]:
-            try:
-                with open(resolved, 'r', encoding='utf-8') as file_obj:
-                    lines = sum(1 for line in file_obj)
-                    actual_files.append((resolved, lines))
-                    total_lines += lines
-            except Exception:
-                pass
-                
-    num_files = len(actual_files)
-    
-    effort = "small"
-    validation_msg = ""
-    
-    keyword_effort = "small"
-    for kw in LARGE_KEYWORDS:
-        if re.search(r'\b' + re.escape(kw) + r'\b', combined_text):
-            keyword_effort = "large"
-            break
-            
-    if keyword_effort != "large":
-        for kw in MEDIUM_KEYWORDS:
-            if re.search(r'\b' + re.escape(kw) + r'\b', combined_text):
-                keyword_effort = "medium"
-                break
-
-    if num_files == 0:
-        effort = keyword_effort if keyword_effort in ['medium', 'large'] else 'medium'
-        validation_msg = f"No specific files identified in codebase. Keyword heuristic: {keyword_effort}."
-    else:
-        file_details = ", ".join([f"{os.path.basename(f[0])} ({f[1]} lines)" for f in actual_files])
-        if num_files > 3 or total_lines > 1500 or keyword_effort == "large":
-            effort = "large"
-            validation_msg = f"Codebase validation: {num_files} files ({file_details}), {total_lines} total lines. Keyword hint: {keyword_effort}."
-        elif num_files >= 2 or total_lines > 500 or keyword_effort == "medium":
-            effort = "medium"
-            validation_msg = f"Codebase validation: {num_files} files ({file_details}), {total_lines} total lines. Keyword hint: {keyword_effort}."
-        else:
-            effort = "small"
-            validation_msg = f"Codebase validation: {num_files} files ({file_details}), {total_lines} total lines. Appears highly localized."
-
-    return effort, validation_msg
-
-updated_count = 0
-for issue in issues:
-    old_effort = issue.get('effort_level')
-    new_effort, validation_reason = analyze_issue(issue)
-    
-    issue['effort_level'] = new_effort
-    
-    existing_reasoning = issue.get('reasoning', '')
-    existing_reasoning = existing_reasoning.split(' | Codebase validation:')[0]
-    existing_reasoning = existing_reasoning.split(' | No specific files identified')[0]
-    
-    issue['reasoning'] = f"{existing_reasoning} | {validation_reason}".strip(' |')
-    
-    if old_effort != new_effort:
-        updated_count += 1
-
-with open(ISSUES_FILE, 'w') as f:
-    json.dump(issues, f, indent=2)
-
-print(f"Successfully re-evaluated and updated {updated_count} issues. Codebase validated.")