refactor: remove hardcoded api keys and paths to make core analyzers generic

Coco Sheng
2026-05-06 16:05:35 -04:00
parent 268fe7cbe1
commit ca9c2009cc
5 changed files with 109 additions and 77 deletions
+3 -3
@@ -21,7 +21,7 @@ and determining implementation effort levels for the Gemini CLI project.
Use this for a quick, first-pass estimation.
```bash
-python3 analyze_bugs.py
+python3 analyze_bugs.py --api-key "YOUR_KEY"
```
### 2. Deep Agentic Analysis
@@ -29,7 +29,7 @@ python3 analyze_bugs.py
Uses Gemini as an agent with access to the codebase.
```bash
-python3 bug_analyzer_final.py
+python3 bug_analyzer_final.py --api-key "YOUR_KEY"
```
### 3. Iterative Analysis
@@ -37,7 +37,7 @@ python3 bug_analyzer_final.py
Runs the single-turn analyzer in a loop until all issues have a valid analysis.
```bash
-./loop_analyzer.sh
+GEMINI_API_KEY="YOUR_KEY" ./loop_analyzer.sh
```
### 4. Validation & Export
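
All three entry points now take the key at invocation time instead of compiling it into the source. A minimal sketch of the shared pattern, assuming the same `MODEL` constant the scripts use (the `build_url` helper is illustrative; the scripts inline the f-string):

```python
import argparse

MODEL = "gemini-3-flash-preview"

def build_url(api_key: str) -> str:
    # The key travels as a query parameter at call time; nothing is baked into source.
    return (f"https://generativelanguage.googleapis.com/v1beta/models/"
            f"{MODEL}:generateContent?key={api_key}")

parser = argparse.ArgumentParser()
parser.add_argument("--api-key", required=True, help="Gemini API Key")
args = parser.parse_args()
url = build_url(args.api_key)
```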
+30 -22
@@ -2,17 +2,11 @@ import json
import urllib.request
import urllib.error
import os
+import argparse
import concurrent.futures
from pathlib import Path
-API_KEY = "REDACTED_API_KEY"
MODEL = "gemini-3-flash-preview"
-URL = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={API_KEY}"
-BUGS_FILE = 'data/bugs.json'
-with open(BUGS_FILE, 'r') as f:
-bugs = json.load(f)
# Collect basic directory structure to provide as context
def get_tree(path, max_depth=3):
@@ -28,16 +22,14 @@ def get_tree(path, max_depth=3):
indent = ' ' * len(depth)
tree.append(f"{indent}{Path(root).name}/")
for f in files:
-if f.endswith(('.ts', '.tsx', '.js', '.json', '.toml', '.md')):
+if f.endswith(('.ts', '.tsx', '.js', '.json', '.toml', '.md', '.py', '.sh')):
tree.append(f"{indent} {f}")
return "\n".join(tree)
-tree_context = get_tree('../../packages')
-def analyze_bug(bug):
+def analyze_bug(bug, url, tree_context):
prompt = f"""
-You are analyzing bugs for the google-gemini/gemini-cli codebase.
-Here is the directory structure of the 'packages' directory:
+You are analyzing bugs for the current codebase.
+Here is the directory structure of the project:
{tree_context[:4000]}
Analyze the following GitHub bug report to determine the implementation effort.
@@ -57,7 +49,7 @@ Reply with ONLY a valid JSON object matching exactly this schema, without Markdo
}
}
-req = urllib.request.Request(URL, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
+req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
try:
with urllib.request.urlopen(req) as response:
result = json.loads(response.read().decode('utf-8'))
@@ -74,12 +66,13 @@ Reply with ONLY a valid JSON object matching exactly this schema, without Markdo
parsed = json.loads(text.strip())
return parsed
except Exception as e:
print(f"Error processing bug {bug['number']}: {e}")
print(f"Error processing bug {bug.get('number', 'unknown')}: {e}")
return {"analysis": "Failed to analyze", "effort_level": "medium", "reasoning": "Error calling Gemini API"}
-def process_bug(bug):
-print(f"Analyzing Bug #{bug['number']}...")
-result = analyze_bug(bug)
+def process_bug_task(args):
+bug, url, tree_context = args
+print(f"Analyzing Bug #{bug.get('number', 'unknown')}...")
+result = analyze_bug(bug, url, tree_context)
bug['analysis'] = result.get('analysis', '')
bug['effort_level'] = result.get('effort_level', 'medium')
bug['reasoning'] = result.get('reasoning', '')
@@ -88,16 +81,31 @@ def process_bug(bug):
return bug
def main():
print(f"Starting analysis of {len(bugs)} bugs...")
parser = argparse.ArgumentParser(description="Static initial triage analyzer for bugs.")
parser.add_argument("--api-key", required=True, help="Gemini API Key")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing bugs")
parser.add_argument("--project", default="../../packages", help="Project root to analyze")
args = parser.parse_args()
url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={args.api_key}"
with open(args.input, 'r') as f:
bugs = json.load(f)
tree_context = get_tree(args.project)
print(f"Starting static analysis of {len(bugs)} bugs...")
# Process in batches to save incrementally
batch_size = 10
for i in range(0, len(bugs), batch_size):
batch = bugs[i:i+batch_size]
-with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-executor.map(process_bug, batch)
+tasks = [(bug, url, tree_context) for bug in batch]
-with open(BUGS_FILE, 'w') as f:
+with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+list(executor.map(process_bug_task, tasks))
+with open(args.input, 'w') as f:
json.dump(bugs, f, indent=2)
print(f"Saved batch {i//batch_size + 1}")
+37 -29
@@ -2,27 +2,21 @@ import json
import urllib.request
import urllib.error
import os
+import argparse
import concurrent.futures
import subprocess
import sys
import threading
-API_KEY = "REDACTED_API_KEY"
MODEL = "gemini-3-flash-preview"
-URL = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={API_KEY}"
-BUGS_FILE = 'data/bugs.json'
file_lock = threading.Lock()
-with open(BUGS_FILE, 'r') as f:
-bugs = json.load(f)
-tools = [
+tools_decl = [
{
"functionDeclarations": [
{
"name": "search_codebase",
"description": "Search the gemini-cli packages directory for a string using grep. Returns matching lines and file paths.",
"description": "Search the project directory for a string using grep. Returns matching lines and file paths.",
"parameters": {
"type": "OBJECT",
"properties": {
@@ -46,17 +40,17 @@ tools = [
}
]
-def call_gemini(messages):
+def call_gemini(messages, url):
data = {
"contents": messages,
"tools": tools,
"tools": tools_decl,
"generationConfig": {"temperature": 0.1}
}
-req = urllib.request.Request(URL, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
+req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as response:
return json.loads(response.read().decode('utf-8'))
-def execute_tool(call):
+def execute_tool(call, project_path):
name = call['name']
args = call.get('args', {})
@@ -64,7 +58,7 @@ def execute_tool(call):
pattern = args.get('pattern', '')
pattern = pattern.replace('"', '\\"')
try:
-cmd = f'grep -rn "{pattern}" ../../packages | grep -vE "node_modules|dist|build|\\.test\\." | head -n 20'
+cmd = f'grep -rn "{pattern}" "{project_path}" | grep -vE "node_modules|dist|build|\\.test\\." | head -n 20'
res = subprocess.check_output(cmd, shell=True, text=True, stderr=subprocess.STDOUT)
return res if res else "No matches found."
except subprocess.CalledProcessError as e:
@@ -72,12 +66,12 @@ def execute_tool(call):
elif name == 'read_file':
filepath = args.get('filepath', '')
if not filepath.startswith('/'):
-filepath = os.path.join('../../packages', filepath)
+filepath = os.path.join(project_path, filepath)
try:
if not os.path.exists(filepath):
basename = os.path.basename(filepath)
-find_cmd = f'find ../../packages -name "{basename}" | head -n 1'
+find_cmd = f'find "{project_path}" -name "{basename}" | head -n 1'
found_path = subprocess.check_output(find_cmd, shell=True, text=True).strip()
if found_path: filepath = found_path
else: return f"File {filepath} not found."
@@ -89,8 +83,8 @@ def execute_tool(call):
return str(e)
return "Unknown tool"
-def analyze_issue(issue):
-system_instruction = """You are a senior software engineer analyzing bug reports for the gemini-cli codebase.
+def analyze_issue(issue, url, project_path):
+system_instruction = """You are a senior software engineer analyzing bug reports.
You MUST use the provided tools to investigate the codebase and pinpoint exactly which files and logic are responsible for the bug.
DO NOT GUESS.
@@ -114,9 +108,9 @@ Output format (ONLY valid JSON, NO markdown):
prompt = f"{system_instruction}\n\nBug Title: {issue.get('title')}\nBug Body: {issue.get('body', '')[:1200]}"
messages = [{"role": "user", "parts": [{"text": prompt}]}]
-for turn in range(30): # Significantly higher turn limit
+for turn in range(30):
try:
-res = call_gemini(messages)
+res = call_gemini(messages, url)
candidate = res['candidates'][0]['content']
parts = candidate.get('parts', [])
@@ -129,7 +123,7 @@ Output format (ONLY valid JSON, NO markdown):
tool_responses = []
for fcall in function_calls:
call_data = fcall['functionCall']
-result = execute_tool(call_data)
+result = execute_tool(call_data, project_path)
tool_responses.append({
"functionResponse": {
"name": call_data['name'],
@@ -146,14 +140,14 @@ Output format (ONLY valid JSON, NO markdown):
return {"analysis": "Failed to analyze autonomously", "effort_level": "medium", "reasoning": "Agent loop exceeded 30 turns or errored."}
-def process_issue(issue):
-# Re-analyze if empty, failed, or just a placeholder
+def process_issue_task(args_tuple):
+issue, url, project_path, input_file, bugs = args_tuple
current_analysis = issue.get('analysis', '')
if current_analysis and current_analysis != "Failed to analyze autonomously" and len(current_analysis) > 50:
return issue
print(f"Analyzing Bug #{issue['number']}...", flush=True)
result = analyze_issue(issue)
print(f"Analyzing Bug #{issue.get('number', 'unknown')}...", flush=True)
result = analyze_issue(issue, url, project_path)
issue['analysis'] = result.get('analysis', 'Failed to analyze')
issue['effort_level'] = result.get('effort_level', 'medium')
@@ -163,18 +157,32 @@ def process_issue(issue):
else:
issue.pop('recommended_implementation', None)
print(f"Completed Bug #{issue['number']} -> {issue['effort_level']}", flush=True)
print(f"Completed Bug #{issue.get('number', 'unknown')} -> {issue.get('effort_level', 'unknown')}", flush=True)
with file_lock:
-with open(BUGS_FILE, 'w') as f:
+with open(input_file, 'w') as f:
json.dump(bugs, f, indent=2)
return issue
def main():
parser = argparse.ArgumentParser(description="Deep agentic bug analyzer.")
parser.add_argument("--api-key", required=True, help="Gemini API Key")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing bugs")
parser.add_argument("--project", default="../../packages", help="Project root to analyze")
args = parser.parse_args()
url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={args.api_key}"
with open(args.input, 'r') as f:
bugs = json.load(f)
print(f"Starting FINAL RE-ANALYSIS for {len(bugs)} bugs (Turn Limit: 30)...", flush=True)
+tasks = [(b, url, args.project, args.input, bugs) for b in bugs]
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
-list(executor.map(process_issue, bugs))
-print("Agentic analysis complete. `bugs.json` is updated.", flush=True)
+list(executor.map(process_issue_task, tasks))
+print("Agentic analysis complete. JSON is updated.", flush=True)
if __name__ == '__main__':
main()
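
For context on what `analyze_issue` is doing with these parameters: each turn posts the conversation plus `tools_decl`, runs any requested tool locally, and feeds the output back as a `functionResponse` part. A trimmed sketch of that loop, taking this file's `call_gemini` and `execute_tool` as injected callables; the `response` payload key is an assumption where the diff truncates:

```python
def agent_loop(messages, call_gemini, execute_tool, url, project_path, max_turns=30):
    """Drive the tool-use conversation until the model stops requesting tools."""
    for _ in range(max_turns):
        res = call_gemini(messages, url)  # POST carrying tools_decl
        parts = res["candidates"][0]["content"].get("parts", [])
        calls = [p["functionCall"] for p in parts if "functionCall" in p]
        if not calls:
            # No tool requests left: the text part carries the final JSON verdict.
            return parts[0].get("text", "") if parts else ""
        # Echo the model turn back, then answer each call with a functionResponse part.
        messages.append({"role": "model", "parts": parts})
        replies = []
        for call in calls:
            result = execute_tool(call, project_path)  # grep / read_file dispatch
            replies.append({"functionResponse": {
                "name": call["name"],
                "response": {"result": result},  # envelope key assumed; the diff truncates here
            }})
        messages.append({"role": "user", "parts": replies})
    return None  # turn budget exhausted; caller falls back to the default verdict
```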
+7 -1
@@ -2,6 +2,12 @@
# Run from the project root or the scripts/backlog-analysis directory
# This script assumes it's running in the same directory as the python scripts
+if [ -z "$GEMINI_API_KEY" ]; then
+echo "Error: GEMINI_API_KEY environment variable is required."
+echo "Usage: GEMINI_API_KEY=your_key ./loop_analyzer.sh"
+exit 1
+fi
while true; do
count=$(jq '[.[] | select(.analysis == "Failed to analyze autonomously" or .analysis == null or .analysis == "" or (.analysis | length) < 30)] | length' data/bugs.json)
if [ "$count" -eq 0 ]; then
@@ -9,6 +15,6 @@ while true; do
break
fi
echo "Remaining bugs: $count"
-python3 single_turn_bug_analyzer.py
+python3 single_turn_bug_analyzer.py --api-key "$GEMINI_API_KEY"
done
python3 generate_bugs_csv.py
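
The guard exists because both Python analyzers now hard-require `--api-key`. One possible follow-up, not part of this commit, is to let the flag default to the same environment variable so either invocation style works; a sketch:

```python
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument(
    "--api-key",
    default=os.environ.get("GEMINI_API_KEY"),  # fall back to the env var the loop script checks
    help="Gemini API Key (defaults to $GEMINI_API_KEY)",
)
args = parser.parse_args()
if not args.api_key:
    parser.error("an API key is required via --api-key or GEMINI_API_KEY")
```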
+32 -22
@@ -3,27 +3,24 @@ import urllib.request
import os
import subprocess
import re
+import argparse
import concurrent.futures
import threading
-API_KEY = "REDACTED_API_KEY"
MODEL = "gemini-3-flash-preview"
-URL = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={API_KEY}"
-BUGS_FILE = 'data/bugs.json'
-with open(BUGS_FILE, 'r') as f:
-bugs = json.load(f)
+file_lock = threading.Lock()
def extract_keywords(text):
words = re.findall(r'\b[A-Z][a-zA-Z0-9]+\b|\b\w+\.tsx?\b|\b\w+Service\b|\b\w+Command\b', text)
words = list(set([w for w in words if len(w) > 4]))
return words[:8]
-def search_codebase(keywords):
+def search_codebase(keywords, project_path):
context = ""
for kw in keywords:
try:
kw_clean = kw.replace('"', '\\"')
-cmd = f'grep -rn "{kw_clean}" ../../packages | grep -vE "node_modules|dist|build|\\.test\\." | head -n 8'
+cmd = f'grep -rn "{kw_clean}" "{project_path}" | grep -vE "node_modules|dist|build|\\.test\\." | head -n 8'
out = subprocess.check_output(cmd, shell=True, text=True, stderr=subprocess.STDOUT)
if out:
context += f"\n--- Matches for {kw_clean} ---\n{out}\n"
@@ -31,7 +28,9 @@ def search_codebase(keywords):
pass
return context
-def process_issue(issue):
+def process_issue_task(args_tuple):
+issue, url, project_path, input_file, bugs = args_tuple
if issue.get('analysis') and issue['analysis'] != "Failed to analyze autonomously" and len(issue['analysis']) > 30:
return issue
@@ -39,9 +38,9 @@ def process_issue(issue):
body = issue.get('body', '')[:1500]
keywords = extract_keywords(title + " " + body)
-code_context = search_codebase(keywords)
+code_context = search_codebase(keywords, project_path)
-prompt = f"""You are a senior software engineer analyzing bug reports for the gemini-cli codebase.
+prompt = f"""You are a senior software engineer analyzing bug reports.
Based on the bug description and the provided codebase search context, pinpoint exactly which files and logic are responsible for the bug.
DO NOT GUESS. If the context isn't enough, provide your best technical hypothesis.
@@ -69,7 +68,7 @@ Output ONLY valid JSON (no markdown block):
}
try:
-req = urllib.request.Request(URL, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
+req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req, timeout=60) as response:
res = json.loads(response.read().decode('utf-8'))
txt = res['candidates'][0]['content']['parts'][0]['text']
@@ -79,27 +78,38 @@ Output ONLY valid JSON (no markdown block):
issue['analysis'] = parsed.get('analysis', 'Failed to analyze')
issue['effort_level'] = parsed.get('effort_level', 'medium')
issue['reasoning'] = parsed.get('reasoning', 'Could not determine')
print(f"Completed {issue['number']} -> {issue['effort_level']}", flush=True)
print(f"Completed {issue.get('number', 'unknown')} -> {issue['effort_level']}", flush=True)
except Exception as e:
print(f"Failed {issue['number']}: {e}", flush=True)
print(f"Failed {issue.get('number', 'unknown')}: {e}", flush=True)
with file_lock:
with open(input_file, 'w') as f:
json.dump(bugs, f, indent=2)
return issue
def main():
parser = argparse.ArgumentParser(description="Single turn code search bug analyzer.")
parser.add_argument("--api-key", required=True, help="Gemini API Key")
parser.add_argument("--input", default="data/bugs.json", help="Input JSON file containing bugs")
parser.add_argument("--project", default="../../packages", help="Project root to analyze")
args = parser.parse_args()
url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={args.api_key}"
with open(args.input, 'r') as f:
bugs = json.load(f)
to_analyze = [b for b in bugs if b.get('analysis') == "Failed to analyze autonomously" or not b.get('analysis') or len(b.get('analysis', '')) < 30]
# Only process 5 at a time
to_analyze = to_analyze[:5]
print(f"Starting single-turn analysis for {len(to_analyze)} bugs...", flush=True)
+tasks = [(b, url, args.project, args.input, bugs) for b in to_analyze]
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-list(executor.map(process_issue, to_analyze))
-# Final save
-with open(BUGS_FILE, 'w') as f:
-json.dump(bugs, f, indent=2)
+list(executor.map(process_issue_task, tasks))
print("Done processing 5 bugs.", flush=True)
print("Done processing batch.", flush=True)
if __name__ == '__main__':
main()
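
Recap of the single-turn flow being parameterized here: identifier-like keywords are pulled from the issue text, grepped across the project root, and the matches are pasted into a one-shot prompt. A compact sketch with the regex and grep filters copied from the diff; the wrapper function itself is illustrative:

```python
import re
import subprocess

def build_context(title, body, project_path):
    text = f"{title} {body}"
    # CamelCase identifiers, *.ts/*.tsx filenames, *Service / *Command symbols.
    words = re.findall(
        r"\b[A-Z][a-zA-Z0-9]+\b|\b\w+\.tsx?\b|\b\w+Service\b|\b\w+Command\b", text)
    keywords = list({w for w in words if len(w) > 4})[:8]
    context = ""
    for kw in keywords:
        kw_clean = kw.replace('"', '\\"')
        cmd = (f'grep -rn "{kw_clean}" "{project_path}" '
               f'| grep -vE "node_modules|dist|build|\\.test\\." | head -n 8')
        try:
            out = subprocess.check_output(cmd, shell=True, text=True,
                                          stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            continue  # grep exits non-zero when nothing matches
        if out:
            context += f"\n--- Matches for {kw_clean} ---\n{out}\n"
    return context
```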