feat: add 'review-pr' skill and sunset 'async-pr-review'

This commit is contained in:
mkorwel
2026-03-13 19:39:41 -07:00
parent 8692b347f0
commit 295831105b
9 changed files with 156 additions and 502 deletions

View File

@@ -1,45 +0,0 @@
---
name: async-pr-review
description: Trigger this skill when the user wants to start an asynchronous PR review, run background checks on a PR, or check the status of a previously started async PR review.
---
# Async PR Review
This skill provides a set of tools to asynchronously review a Pull Request. It will create a background job to run the project's preflight checks, execute Gemini-powered test plans, and perform a comprehensive code review using custom prompts.
This skill is designed to showcase an advanced "Agentic Asynchronous Pattern":
1. **Native Background Shells vs. Headless Inference**: While Gemini CLI can natively spawn and detach background shell commands (via the `run_shell_command` tool with `is_background: true`), a plain bash background job cannot perform LLM inference on its own. To run AI-driven code review and test generation in the background, the shell script *must* invoke the `gemini` executable headlessly with `-p`, offloading those AI tasks to independent worker agents (see the sketch after this list).
2. **Dynamic Git Scoping**: The review scripts avoid hardcoded paths. They use `git rev-parse --show-toplevel` to automatically resolve the root of the user's current project.
3. **Ephemeral Worktrees**: Instead of checking out branches in the user's main workspace, the skill provisions temporary git worktrees in `.gemini/tmp/async-reviews/pr-<number>`. This prevents git lock conflicts and namespace pollution.
4. **Agentic Evaluation (`check-async-review.sh`)**: The check script outputs clean JSON/text statuses for the main agent to parse. The interactive agent itself synthesizes the final assessment dynamically from the generated log files.
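A condensed sketch of how these pieces fit together; the prompt text, branch naming, and output target are illustrative, and the real `async-review.sh` adds logging, exit sentinels, and cleanup:
```bash
#!/bin/bash
# Minimal sketch of the pattern above (names and prompt are illustrative).
pr="$1"                                              # PR number from the caller
repo_root=$(git rev-parse --show-toplevel)           # 2. dynamic git scoping
worktree="$repo_root/.gemini/tmp/async-reviews/pr-$pr/worktree"
git fetch origin "pull/$pr/head:gemini-async-pr-$pr"
git worktree add "$worktree" "gemini-async-pr-$pr"   # 3. ephemeral worktree
# 1. headless inference: a detached background job invokes the gemini binary with -p
gemini -p "Review the changes checked out in $worktree" > "$repo_root/review.md" 2>&1 &
```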
## Workflow
1. **Determine Action**: Establish whether the user wants to start a new async review or check the status of an existing one.
* If the user says "start an async review for PR #123" or similar, proceed to **Start Review**.
* If the user says "check the status of my async review for PR #123" or similar, proceed to **Check Status**.
### Start Review
If the user wants to start a new async PR review:
1. Ask the user for the PR number if they haven't provided it.
2. Execute the `async-review.sh` script, passing the PR number as the first argument. Run it with the `is_background` flag set to true so that it detaches immediately.
```bash
.gemini/skills/async-pr-review/scripts/async-review.sh <PR_NUMBER>
```
3. Inform the user that the tasks have started successfully and they can check the status later.
### Check Status
If the user wants to check the status or view the final assessment of a previously started async review:
1. Ask the user for the PR number if they haven't provided it.
2. Execute the `check-async-review.sh` script, passing the PR number as the first argument:
```bash
.gemini/skills/async-pr-review/scripts/check-async-review.sh <PR_NUMBER>
```
3. **Evaluate Output**: Read the output from the script (an illustrative sample follows this list).
* If the output contains `STATUS: IN_PROGRESS`, tell the user which tasks are still running.
* If the output contains `STATUS: COMPLETE`, use your file reading tools (`read_file`) to retrieve `final-assessment.md`, `review.md`, `pr-diff.diff`, `npm-test.log`, and `test-execution.log` from the `LOG_DIR` specified in the output.
* **Final Assessment**: Synthesize those results and give the user a concise recommendation on whether the PR builds successfully, passes its tests, and deserves approval based on the review.
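For orientation, a finished run produces output along these lines (task names match the scripts; the `LOG_DIR` path is illustrative):
```bash
.gemini/skills/async-pr-review/scripts/check-async-review.sh <PR_NUMBER>
# Illustrative output once every task has written its exit sentinel:
#   STATUS: CHECKING
#   setup: SUCCESS
#   pr-diff: SUCCESS
#   build-and-lint: SUCCESS
#   review: SUCCESS
#   npm-test: SUCCESS
#   test-execution: SUCCESS
#   final-assessment: SUCCESS
#   STATUS: COMPLETE
#   LOG_DIR: <repo_root>/.gemini/tmp/async-reviews/pr-<PR_NUMBER>/logs
```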

View File

@@ -1,148 +0,0 @@
# --- CORE TOOLS ---
[[rule]]
toolName = "read_file"
decision = "allow"
priority = 100
[[rule]]
toolName = "write_file"
decision = "allow"
priority = 100
[[rule]]
toolName = "grep_search"
decision = "allow"
priority = 100
[[rule]]
toolName = "glob"
decision = "allow"
priority = 100
[[rule]]
toolName = "list_directory"
decision = "allow"
priority = 100
[[rule]]
toolName = "codebase_investigator"
decision = "allow"
priority = 100
# --- SHELL COMMANDS ---
# Git (Safe/Read-only)
[[rule]]
toolName = "run_shell_command"
commandPrefix = [
"git blame",
"git show",
"git grep",
"git show-ref",
"git ls-tree",
"git ls-remote",
"git reflog",
"git remote -v",
"git diff",
"git rev-list",
"git rev-parse",
"git merge-base",
"git cherry",
"git fetch",
"git status",
"git st",
"git branch",
"git br",
"git log",
"git --version"
]
decision = "allow"
priority = 100
# GitHub CLI (Read-only)
[[rule]]
toolName = "run_shell_command"
commandPrefix = [
"gh workflow list",
"gh auth status",
"gh checkout view",
"gh run view",
"gh run job view",
"gh run list",
"gh run --help",
"gh issue view",
"gh issue list",
"gh label list",
"gh pr diff",
"gh pr check",
"gh pr checks",
"gh pr view",
"gh pr list",
"gh pr status",
"gh repo view",
"gh job view",
"gh api",
"gh log"
]
decision = "allow"
priority = 100
# Node.js/NPM (Generic Tests, Checks, and Build)
[[rule]]
toolName = "run_shell_command"
commandPrefix = [
"npm run start",
"npm install",
"npm run",
"npm test",
"npm ci",
"npm list",
"npm --version"
]
decision = "allow"
priority = 100
# Core Utilities (Safe)
[[rule]]
toolName = "run_shell_command"
commandPrefix = [
"sleep",
"env",
"break",
"xargs",
"base64",
"uniq",
"sort",
"echo",
"which",
"ls",
"find",
"tail",
"head",
"cat",
"cd",
"grep",
"ps",
"pwd",
"wc",
"file",
"stat",
"diff",
"lsof",
"date",
"whoami",
"uname",
"du",
"cut",
"true",
"false",
"readlink",
"awk",
"jq",
"rg",
"less",
"more",
"tree"
]
decision = "allow"
priority = 100

View File

@@ -1,241 +0,0 @@
#!/bin/bash
notify() {
local title="$1"
local message="$2"
local pr="$3"
# Terminal escape sequence
printf "\e]9;%s | PR #%s | %s\a" "$title" "$pr" "$message"
# Native macOS notification
if [[ "$(uname)" == "Darwin" ]]; then
osascript -e "display notification \"$message\" with title \"$title\" subtitle \"PR #$pr\""
fi
}
pr_number=$1
if [[ -z "$pr_number" ]]; then
echo "Usage: async-review <pr_number>"
exit 1
fi
base_dir=$(git rev-parse --show-toplevel 2>/dev/null)
if [[ -z "$base_dir" ]]; then
echo "❌ Must be run from within a git repository."
exit 1
fi
# Use the repository's local .gemini/tmp directory for ephemeral worktrees and logs
pr_dir="$base_dir/.gemini/tmp/async-reviews/pr-$pr_number"
target_dir="$pr_dir/worktree"
log_dir="$pr_dir/logs"
cd "$base_dir" || exit 1
mkdir -p "$log_dir"
rm -f "$log_dir/setup.exit" "$log_dir/final-assessment.exit" "$log_dir/final-assessment.md"
echo "🧹 Cleaning up previous worktree if it exists..." | tee -a "$log_dir/setup.log"
git worktree remove -f "$target_dir" >> "$log_dir/setup.log" 2>&1 || true
git branch -D "gemini-async-pr-$pr_number" >> "$log_dir/setup.log" 2>&1 || true
git worktree prune >> "$log_dir/setup.log" 2>&1 || true
echo "📡 Fetching PR #$pr_number..." | tee -a "$log_dir/setup.log"
if ! git fetch origin -f "pull/$pr_number/head:gemini-async-pr-$pr_number" >> "$log_dir/setup.log" 2>&1; then
echo 1 > "$log_dir/setup.exit"
echo "❌ Fetch failed. Check $log_dir/setup.log"
notify "Async Review Failed" "Fetch failed." "$pr_number"
exit 1
fi
if [[ ! -d "$target_dir" ]]; then
echo "🧹 Pruning missing worktrees..." | tee -a "$log_dir/setup.log"
git worktree prune >> "$log_dir/setup.log" 2>&1
echo "🌿 Creating worktree in $target_dir..." | tee -a "$log_dir/setup.log"
if ! git worktree add "$target_dir" "gemini-async-pr-$pr_number" >> "$log_dir/setup.log" 2>&1; then
echo 1 > "$log_dir/setup.exit"
echo "❌ Worktree creation failed. Check $log_dir/setup.log"
notify "Async Review Failed" "Worktree creation failed." "$pr_number"
exit 1
fi
else
echo "🌿 Worktree already exists." | tee -a "$log_dir/setup.log"
fi
echo 0 > "$log_dir/setup.exit"
cd "$target_dir" || exit 1
echo "🚀 Launching background tasks. Logs saving to: $log_dir"
echo " ↳ [1/5] Grabbing PR diff..."
rm -f "$log_dir/pr-diff.exit"
{ gh pr diff "$pr_number" > "$log_dir/pr-diff.diff" 2>&1; echo $? > "$log_dir/pr-diff.exit"; } &
echo " ↳ [2/5] Starting build and lint..."
rm -f "$log_dir/build-and-lint.exit"
{ { npm run clean && npm ci && npm run format && npm run build && npm run lint:ci && npm run typecheck; } > "$log_dir/build-and-lint.log" 2>&1; echo $? > "$log_dir/build-and-lint.exit"; } &
# Resolve the gemini binary dynamically (falling back to the nightly install path)
GEMINI_CMD=$(which gemini || echo "$HOME/.gcli/nightly/node_modules/.bin/gemini")
POLICY_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/policy.toml"
echo " ↳ [3/5] Starting Gemini code review..."
rm -f "$log_dir/review.exit"
{ "$GEMINI_CMD" --policy "$POLICY_PATH" -p "/review-frontend $pr_number" > "$log_dir/review.md" 2>&1; echo $? > "$log_dir/review.exit"; } &
echo " ↳ [4/5] Starting automated tests (waiting for build and lint)..."
rm -f "$log_dir/npm-test.exit"
{
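# Wait for the build task's exit sentinel, then let the CI state decide how much
# local testing is needed (passed or pending: skip; failed: reproduce only the failures).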
while [ ! -f "$log_dir/build-and-lint.exit" ]; do sleep 1; done
if [ "$(cat "$log_dir/build-and-lint.exit")" == "0" ]; then
gh pr checks "$pr_number" > "$log_dir/ci-checks.log" 2>&1
ci_status=$?
if [ "$ci_status" -eq 0 ]; then
echo "CI checks passed. Skipping local npm tests." > "$log_dir/npm-test.log"
echo 0 > "$log_dir/npm-test.exit"
elif [ "$ci_status" -eq 8 ]; then
echo "CI checks are still pending. Skipping local npm tests to avoid duplicate work. Please check GitHub for final results." > "$log_dir/npm-test.log"
echo 0 > "$log_dir/npm-test.exit"
else
echo "CI checks failed. Failing checks:" > "$log_dir/npm-test.log"
gh pr checks "$pr_number" --json name,bucket -q '.[] | select(.bucket=="fail") | .name' >> "$log_dir/npm-test.log" 2>&1
echo "Attempting to extract failing test files from CI logs..." >> "$log_dir/npm-test.log"
pr_branch=$(gh pr view "$pr_number" --json headRefName -q '.headRefName' 2>/dev/null)
run_id=$(gh run list --branch "$pr_branch" --workflow ci.yml --json databaseId -q '.[0].databaseId' 2>/dev/null)
failed_files=""
if [[ -n "$run_id" ]]; then
failed_files=$(gh run view "$run_id" --log-failed 2>/dev/null | grep -o -E '(packages/[a-zA-Z0-9_-]+|integration-tests|evals)/[a-zA-Z0-9_/-]+\.test\.ts(x)?' | sort | uniq)
fi
if [[ -n "$failed_files" ]]; then
echo "Found failing test files from CI:" >> "$log_dir/npm-test.log"
for f in $failed_files; do echo " - $f" >> "$log_dir/npm-test.log"; done
echo "Running ONLY failing tests locally..." >> "$log_dir/npm-test.log"
exit_code=0
for file in $failed_files; do
if [[ "$file" == packages/* ]]; then
ws_dir=$(echo "$file" | cut -d'/' -f1,2)
else
ws_dir=$(echo "$file" | cut -d'/' -f1)
fi
rel_file=${file#$ws_dir/}
echo "--- Running $rel_file in workspace $ws_dir ---" >> "$log_dir/npm-test.log"
if ! npm run test:ci -w "$ws_dir" -- "$rel_file" >> "$log_dir/npm-test.log" 2>&1; then
exit_code=1
fi
done
echo $exit_code > "$log_dir/npm-test.exit"
else
echo "Could not extract specific failing files. Skipping full local test suite as it takes too long. Please check CI logs manually." >> "$log_dir/npm-test.log"
echo 1 > "$log_dir/npm-test.exit"
fi
fi
else
echo "Skipped due to build-and-lint failure" > "$log_dir/npm-test.log"
echo 1 > "$log_dir/npm-test.exit"
fi
} &
echo " ↳ [5/5] Starting Gemini test execution (waiting for build and lint)..."
rm -f "$log_dir/test-execution.exit"
{
while [ ! -f "$log_dir/build-and-lint.exit" ]; do sleep 1; done
if [ "$(cat "$log_dir/build-and-lint.exit")" == "0" ]; then
"$GEMINI_CMD" --policy "$POLICY_PATH" -p "Analyze the diff for PR $pr_number using 'gh pr diff $pr_number'. Instead of running the project's automated test suite (like 'npm test'), physically exercise the newly changed code in the terminal (e.g., by writing a temporary script to call the new functions, or testing the CLI command directly). Verify the feature's behavior works as expected. IMPORTANT: Do NOT modify any source code to fix errors. Just exercise the code and log the results, reporting any failures clearly. Do not ask for user confirmation." > "$log_dir/test-execution.log" 2>&1; echo $? > "$log_dir/test-execution.exit"
else
echo "Skipped due to build-and-lint failure" > "$log_dir/test-execution.log"
echo 1 > "$log_dir/test-execution.exit"
fi
} &
echo "✅ All tasks dispatched!"
echo "You can monitor progress with: tail -f $log_dir/*.log"
echo "Read your review later at: $log_dir/review.md"
# Polling loop to wait for all background tasks to finish
tasks=("pr-diff" "build-and-lint" "review" "npm-test" "test-execution")
log_files=("pr-diff.diff" "build-and-lint.log" "review.md" "npm-test.log" "test-execution.log")
declare -A task_done
for t in "${tasks[@]}"; do task_done[$t]=0; done
all_done=0
while [[ $all_done -eq 0 ]]; do
clear
echo "=================================================="
echo "🚀 Async PR Review Status for PR #$pr_number"
echo "=================================================="
echo ""
all_done=1
for i in "${!tasks[@]}"; do
t="${tasks[$i]}"
if [[ -f "$log_dir/$t.exit" ]]; then
exit_code=$(cat "$log_dir/$t.exit")
if [[ "$exit_code" == "0" ]]; then
echo "$t: SUCCESS"
else
echo "$t: FAILED (exit code $exit_code)"
fi
task_done[$t]=1
else
echo "$t: RUNNING"
all_done=0
fi
done
echo ""
echo "=================================================="
echo "📝 Live Logs (Last 5 lines of running tasks)"
echo "=================================================="
for i in "${!tasks[@]}"; do
t="${tasks[$i]}"
log_file="${log_files[$i]}"
if [[ ${task_done[$t]} -eq 0 ]]; then
if [[ -f "$log_dir/$log_file" ]]; then
echo ""
echo "--- $t ---"
tail -n 5 "$log_dir/$log_file"
fi
fi
done
if [[ $all_done -eq 0 ]]; then
sleep 3
fi
done
clear
echo "=================================================="
echo "🚀 Async PR Review Status for PR #$pr_number"
echo "=================================================="
echo ""
for t in "${tasks[@]}"; do
exit_code=$(cat "$log_dir/$t.exit")
if [[ "$exit_code" == "0" ]]; then
echo "$t: SUCCESS"
else
echo "$t: FAILED (exit code $exit_code)"
fi
done
echo ""
echo "⏳ Tasks complete! Synthesizing final assessment..."
if ! "$GEMINI_CMD" --policy "$POLICY_PATH" -p "Read the review at $log_dir/review.md, the automated test logs at $log_dir/npm-test.log, and the manual test execution logs at $log_dir/test-execution.log. Summarize the results, state whether the build and tests passed based on $log_dir/build-and-lint.exit and $log_dir/npm-test.exit, and give a final recommendation for PR $pr_number." > "$log_dir/final-assessment.md" 2>&1; then
echo 1 > "$log_dir/final-assessment.exit" # $? is 0 here (it reflects the negated test), so record the failure explicitly
echo "❌ Final assessment synthesis failed!"
echo "Check $log_dir/final-assessment.md for details."
notify "Async Review Failed" "Final assessment synthesis failed." "$pr_number"
exit 1
fi
echo 0 > "$log_dir/final-assessment.exit"
echo "✅ Final assessment complete! Check $log_dir/final-assessment.md"
notify "Async Review Complete" "Review and test execution finished successfully." "$pr_number"

View File

@@ -1,65 +0,0 @@
#!/bin/bash
pr_number=$1
if [[ -z "$pr_number" ]]; then
echo "Usage: check-async-review <pr_number>"
exit 1
fi
base_dir=$(git rev-parse --show-toplevel 2>/dev/null)
if [[ -z "$base_dir" ]]; then
echo "❌ Must be run from within a git repository."
exit 1
fi
log_dir="$base_dir/.gemini/tmp/async-reviews/pr-$pr_number/logs"
if [[ ! -d "$log_dir" ]]; then
echo "STATUS: NOT_FOUND"
echo "❌ No logs found for PR #$pr_number in $log_dir"
exit 0
fi
tasks=(
"setup|setup.log"
"pr-diff|pr-diff.diff"
"build-and-lint|build-and-lint.log"
"review|review.md"
"npm-test|npm-test.log"
"test-execution|test-execution.log"
"final-assessment|final-assessment.md"
)
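# Each entry pairs a task name (which owns a "<name>.exit" sentinel) with the log
# artifact to surface when that task fails or is still running.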
all_done=true
echo "STATUS: CHECKING"
for task_info in "${tasks[@]}"; do
IFS="|" read -r task_name log_file <<< "$task_info"
file_path="$log_dir/$log_file"
exit_file="$log_dir/$task_name.exit"
if [[ -f "$exit_file" ]]; then
exit_code=$(cat "$exit_file")
if [[ "$exit_code" == "0" ]]; then
echo "$task_name: SUCCESS"
else
echo "$task_name: FAILED (exit code $exit_code)"
echo " Last lines of $file_path:"
tail -n 3 "$file_path" | sed 's/^/ /'
fi
elif [[ -f "$file_path" ]]; then
echo "$task_name: RUNNING"
all_done=false
else
echo " $task_name: NOT STARTED"
all_done=false
fi
done
if $all_done; then
echo "STATUS: COMPLETE"
echo "LOG_DIR: $log_dir"
else
echo "STATUS: IN_PROGRESS"
fi

View File

@@ -74,3 +74,16 @@ The orchestration logic for this skill is fully tested. To run the tests:
npx vitest .gemini/skills/offload/tests/orchestration.test.ts
```
These tests mock the external environment (SSH, GitHub CLI, and the file system) to ensure that the orchestration scripts generate the correct commands and handle environment isolation accurately.
## Future roadmap: Offload V2 (Fleet)
The current version of `offload` focuses on a persistent SSH workstation. The long-term vision for this tool includes:
- **Elastic compute**: Moving from a single machine to a fleet of ephemeral
remote containers (for example, using Cloud Run Jobs or GKE Pods).
- **Immutable base images**: Building a Docker container for every commit to
`main` to ensure a guaranteed, pristine verification environment.
- **Stateless execution**: Eliminating the need for manual cleanup by using
on-demand, disposable compute nodes for every offload task.
- **Massive parallelism**: Enabling dozens of simultaneous PR reviews and
implementations across the elastic fleet.

View File

@@ -29,12 +29,35 @@ toolName = "codebase_investigator"
decision = "allow"
priority = 100
# --- SKILLS ---
[[rule]]
toolName = "activate_skill"
decision = "allow"
priority = 100
[rule.toolParameters]
name = "review-pr"
[[rule]]
toolName = "activate_skill"
decision = "allow"
priority = 100
[rule.toolParameters]
name = "fix-pr"
[[rule]]
toolName = "activate_skill"
decision = "allow"
priority = 100
[rule.toolParameters]
name = "pr-address-comments"
# --- SHELL COMMANDS ---
# Git (Safe/Read-only)
# Git (Safe/Read-only + Local State)
[[rule]]
toolName = "run_shell_command"
commandPrefix = [
"git checkout",
"git blame",
"git show",
"git grep",

View File

@@ -10,8 +10,7 @@ export async function runReviewPlaybook(prNumber: string, targetDir: string, pol
runner.register([
{ id: 'build', name: 'Fast Build', cmd: `cd ${targetDir} && npm ci && npm run build` },
{ id: 'ci', name: 'CI Checks', cmd: `gh pr checks ${prNumber}` },
{ id: 'review', name: 'Gemini Analysis', cmd: `${geminiBin} --policy ${policyPath} --cwd ${targetDir} -p "/review-frontend ${prNumber}"` },
{ id: 'verify', name: 'Behavioral Proof', cmd: `${geminiBin} --policy ${policyPath} --cwd ${targetDir} -p "Analyze the code in ${targetDir} and exercise it to prove it works."`, dep: 'build' }
{ id: 'review', name: 'Offloaded Review', cmd: `${geminiBin} --policy ${policyPath} --cwd ${targetDir} -p "Please activate the 'review-pr' skill and use it to conduct a behavioral review of PR #${prNumber}."` }
]);
return runner.run();

View File

@@ -0,0 +1,59 @@
# Review PR Skill
This skill enables the agent to conduct a high-fidelity, behavioral code review
of a Pull Request. It is designed to run in a dedicated, offloaded remote
session where parallel infrastructure validation (build, CI checks) is already
underway.
## Objective
Conduct a thorough review that goes beyond static analysis. The agent must
**physically prove** that the feature works and that regressions are caught by
exercising the code in a live terminal.
## Workflow: Verify then Synthesize
The agent must follow these steps to conduct the review:
### 1. Context Acquisition
- Read the PR description: `gh pr view <PR_NUMBER>`.
- Read the intent from the description and map it against the diff.
- Run `/review-frontend <PR_NUMBER>` to get an initial static analysis report.
### 2. Infrastructure Validation
- Trust the parallel worker results in `.gemini/logs/offload-<PR_NUMBER>/`.
- Read `build.log` to ensure the environment is stable for testing.
- Read `ci-status.exit`.
- **If CI is failing**: Use the extractor to identify failing tests:
`npx tsx .gemini/skills/review-pr/scripts/extract-failures.ts <PR_NUMBER>`
- Reproduce the failing tests locally and analyze the output (an illustrative sequence follows this list).
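An illustrative sequence, assuming the worker logs follow the layout above and the exit sentinels use the usual zero-on-success convention:
```bash
log_dir=".gemini/logs/offload-<PR_NUMBER>"
tail -n 50 "$log_dir/build.log"    # confirm the environment built cleanly
cat "$log_dir/ci-status.exit"      # 0 is assumed to mean CI passed
# Only when CI is failing: narrow the local repro surface first
npx tsx .gemini/skills/review-pr/scripts/extract-failures.ts <PR_NUMBER>
```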
### 3. Behavioral Verification (The "Proof")
- Do NOT just trust that tests pass.
- Physically exercise the new code in the terminal.
- Examples of "Proof":
- If it's a new CLI command, run it with various flags.
- If it's a new service, write a small `.ts` script to call the functions.
- If it's a UI component, verify the Ink layout renders correctly.
- Log these proof steps and results in your conversation (a minimal sketch follows this list).
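A minimal sketch for the CLI case; the command name, flags, and fixture path are hypothetical:
```bash
# Hypothetical new subcommand under review: exercise a happy path and an edge case
npx my-cli summarize --format json examples/small-input.txt
npx my-cli summarize --format json /dev/null  # empty input should fail gracefully
echo "empty-input exit status: $?"
```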
### 4. Synthesis
- Combine all data points:
- **Static Analysis** (Code quality, style).
- **Infrastructure** (Build status, CI status).
- **Behavioral Proof** (Empirical verification).
- Check for merge conflicts with `main` (a non-destructive probe is sketched below).
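One non-destructive probe, assuming Git 2.38+ (where `git merge-tree --write-tree` performs the merge in memory and exits non-zero on conflicts):
```bash
git fetch origin main
if git merge-tree --write-tree HEAD origin/main > /dev/null; then
  echo "No merge conflicts with main"
else
  echo "Conflicts with main detected; flag them in the review"
fi
```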
## Final Assessment
Provide a final recommendation with three sections:
1. **Summary**: High-level verdict (Approve / Needs Work / Reject).
2. **Verified Behavior**: Describe the scripts/commands you ran to prove the
feature works.
3. **Findings**: List Critical issues, Improvements, and Nitpicks.
## Best Practices
- **Be Skeptical**: Just because CI is green doesn't mean the feature is complete. Look for missing edge cases.
- **Collaborate**: If you find a bug during verification, tell the user immediately in the main window.
- **Don't Fix**: Your role is to **review**, not to fix. If the PR is failing, summarize the causes and request changes.

View File

@@ -0,0 +1,59 @@
/**
* CI Failure Extraction Utility
*
* Analyzes remote CI logs to identify specific failing test files.
* Used by the review-pr and fix-pr skills to minimize local repro noise.
*/
import { spawnSync } from 'child_process';
async function main() {
const prNumber = process.argv[2];
if (!prNumber) {
console.error('Usage: npx tsx extract-failures.ts <PR_NUMBER>');
process.exit(1);
}
console.log(`🔍 Investigating CI failures for PR #${prNumber}...`);
// 1. Get branch name
const branchView = spawnSync('gh', ['pr', 'view', prNumber, '--json', 'headRefName', '-q', '.headRefName'], { shell: true });
const branchName = branchView.stdout?.toString().trim() ?? '';
if (!branchName) {
  console.error('⚠️ Could not resolve the head branch for this PR. Is `gh` installed and authenticated?');
  process.exit(1);
}
// 2. Get latest CI run ID
const runList = spawnSync('gh', ['run', 'list', '--branch', branchName, '--workflow', 'ci.yml', '--json', 'databaseId', '-q', '.[0].databaseId'], { shell: true });
const runId = runList.stdout.toString().trim();
if (!runId) {
console.log('⚠️ No recent CI runs found for this branch.');
process.exit(0);
}
// 3. Extract failing files from logs
// Pattern matches common test locations in the monorepo
const logView = spawnSync('gh', ['run', 'view', runId, '--log-failed'], { shell: true });
const logOutput = logView.stdout.toString();
const testPattern = /(packages\/[a-zA-Z0-9_-]+|integration-tests|evals)\/[a-zA-Z0-9_\/-]+\.test\.ts(x)?/g;
const matches = logOutput.match(testPattern);
if (!matches || matches.length === 0) {
console.log('✅ No specific failing test files detected in logs.');
process.exit(0);
}
const uniqueFiles = Array.from(new Set(matches)).sort();
console.log('\n❌ Found failing test files:');
uniqueFiles.forEach(f => console.log(` - ${f}`));
console.log('\n👉 Recommendation: Run these tests locally using:');
uniqueFiles.forEach(file => {
let wsDir = file.split('/')[0];
if (wsDir === 'packages') {
wsDir = file.split('/').slice(0, 2).join('/');
}
const relFile = file.replace(`${wsDir}/`, '');
console.log(` npm run test:ci -w ${wsDir} -- ${relFile}`);
});
}
main().catch(console.error);