fix(hooks): resolve Windows flakiness and improve reliability

2026-06-12 20:37:08 -07:00 · 2026-02-10 13:34:57 -08:00
parent b077cfe061
commit 80a0f04cfd
7 changed files with 129 additions and 147 deletions
@@ -4,7 +4,6 @@ on:
  push:
    branches:
      - 'main'
-      - 'ntm/repro-hook-flakiness'
  merge_group:
  workflow_run:
    workflows: ['Trigger E2E']
@@ -131,7 +130,8 @@ jobs:
      - 'merge_queue_skipper'
      - 'parse_run_context'
    runs-on: 'gemini-cli-ubuntu-16-core'
-    if: false
+    if: |
+      always() && (needs.merge_queue_skipper.result !='success' || needs.merge_queue_skipper.outputs.skip != 'true')
    strategy:
      fail-fast: false
      matrix:
@@ -148,13 +148,43 @@ jobs:
          ref: '${{ needs.parse_run_context.outputs.sha }}'
          repository: '${{ needs.parse_run_context.outputs.repository }}'

+      - name: 'Set up Node.js ${{ matrix.node-version }}'
+        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions-node@v4
+        with:
+          node-version: '${{ matrix.node-version }}'
+
+      - name: 'Install dependencies'
+        run: 'npm ci'
+
+      - name: 'Build project'
+        run: 'npm run build'
+
+      - name: 'Set up Docker'
+        if: "${{matrix.sandbox == 'sandbox:docker'}}"
+        uses: 'docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435' # ratchet:docker/setup-buildx-action@v3
+
+      - name: 'Run E2E tests'
+        env:
+          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
+          KEEP_OUTPUT: 'true'
+          VERBOSE: 'true'
+          BUILD_SANDBOX_FLAGS: '--cache-from type=gha --cache-to type=gha,mode=max'
+        shell: 'bash'
+        run: |
+          if [[ "${{ matrix.sandbox }}" == "sandbox:docker" ]]; then
+            npm run test:integration:sandbox:docker
+          else
+            npm run test:integration:sandbox:none
+          fi
+
  e2e_mac:
    name: 'E2E Test (macOS)'
    needs:
      - 'merge_queue_skipper'
      - 'parse_run_context'
    runs-on: 'macos-latest'
-    if: false
+    if: |
+      always() && (needs.merge_queue_skipper.result !='success' || needs.merge_queue_skipper.outputs.skip != 'true')
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
@@ -162,6 +192,30 @@ jobs:
          ref: '${{ needs.parse_run_context.outputs.sha }}'
          repository: '${{ needs.parse_run_context.outputs.repository }}'

+      - name: 'Set up Node.js 20.x'
+        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions-node@v4
+        with:
+          node-version: '20.x'
+
+      - name: 'Install dependencies'
+        run: 'npm ci'
+
+      - name: 'Build project'
+        run: 'npm run build'
+
+      - name: 'Fix rollup optional dependencies on macOS'
+        if: "${{runner.os == 'macOS'}}"
+        run: |
+          npm cache clean --force
+      - name: 'Run E2E tests (non-Windows)'
+        if: "${{runner.os != 'Windows'}}"
+        env:
+          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
+          KEEP_OUTPUT: 'true'
+          SANDBOX: 'sandbox:none'
+          VERBOSE: 'true'
+        run: 'npm run test:integration:sandbox:none'
+
  e2e_windows:
    name: 'Slow E2E - Win'
    needs:
@@ -221,7 +275,7 @@ jobs:
          UV_THREADPOOL_SIZE: '32'
          NODE_ENV: 'test'
        shell: 'pwsh'
-        run: 'npx vitest run --root integration-tests hooks-system.test.ts --allowOnly'
+        run: 'npm run test:integration:sandbox:none'

  evals:
    name: 'Evals (ALWAYS_PASSING)'
@@ -229,7 +283,8 @@ jobs:
      - 'merge_queue_skipper'
      - 'parse_run_context'
    runs-on: 'gemini-cli-ubuntu-16-core'
-    if: false
+    if: |
+      always() && (needs.merge_queue_skipper.result !='success' || needs.merge_queue_skipper.outputs.skip != 'true')
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
@@ -237,22 +292,42 @@ jobs:
          ref: '${{ needs.parse_run_context.outputs.sha }}'
          repository: '${{ needs.parse_run_context.outputs.repository }}'

+      - name: 'Set up Node.js 20.x'
+        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions-node@v4
+        with:
+          node-version: '20.x'
+
+      - name: 'Install dependencies'
+        run: 'npm ci'
+
+      - name: 'Build project'
+        run: 'npm run build'
+
+      - name: 'Run Evals (Required to pass)'
+        env:
+          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
+        run: 'npm run test:always_passing_evals'
+
  e2e:
    name: 'E2E'
    if: |
      always() && (needs.merge_queue_skipper.result !='success' || needs.merge_queue_skipper.outputs.skip != 'true')
    needs:
-      - 'e2e_windows'
+      - 'e2e_linux'
+      - 'e2e_mac'
+      - 'evals'
      - 'merge_queue_skipper'
    runs-on: 'gemini-cli-ubuntu-16-core'
    steps:
      - name: 'Check E2E test results'
        run: |
-          if [[ ${{ needs.e2e_windows.result }} != 'success' ]]; then
-            echo "Windows E2E job failed."
+          if [[ ${{ needs.e2e_linux.result }} != 'success' || \
+               ${{ needs.e2e_mac.result }} != 'success' || \
+               ${{ needs.evals.result }} != 'success' ]]; then
+            echo "One or more E2E jobs failed."
            exit 1
          fi
-          echo "Windows E2E job passed!"
+          echo "All required E2E jobs passed!"

  set_workflow_status:
    runs-on: 'gemini-cli-ubuntu-16-core'
@@ -87,7 +87,7 @@ describe('Hooks System Integration', () => {
      expect(hookTelemetryFound).toBeTruthy();
    });

-    it.only('should block tool execution and use stderr as reason when hook exits with code 2', async () => {
+    it('should block tool execution and use stderr as reason when hook exits with code 2', async () => {
      rig.setup(
        'should block tool execution and use stderr as reason when hook exits with code 2',
        {
@@ -110,7 +110,6 @@ process.exit(0);
      );

      const normalizedCmd = normalizePath(`node "${scriptPath}"`);
-      console.log(`[Test] normalizedCmd: ${normalizedCmd}`);

      rig.setup(
        'should block tool execution and use stderr as reason when hook exits with code 2',
@@ -156,14 +155,17 @@ process.exit(0);
      // Result should mention the blocking reason from stderr
      expect(result).toContain('File writing blocked by security policy');

-      // Verify hook telemetry shows exit code 2 and stderr
+      // Verify hook telemetry shows the deny decision
      const hookLogs = rig.readHookLogs();
-      const blockHook = hookLogs.find((log) => log.hookCall.exit_code === 2);
+      const blockHook = hookLogs.find(
+        (log) =>
+          log.hookCall.hook_name.includes('stderr_block_hook.cjs') &&
+          log.hookCall.hook_output.includes('"decision":"deny"'),
+      );
      expect(blockHook).toBeDefined();
-      expect(blockHook?.hookCall.stderr).toContain(
+      expect(blockHook?.hookCall.hook_output).toContain(
        'File writing blocked by security policy',
      );
-      expect(blockHook?.hookCall.success).toBe(false);
    });

    it('should allow tool execution when hook returns allow decision', async () => {
@@ -1894,10 +1896,9 @@ console.log(JSON.stringify({decision: "block", systemMessage: "Disabled hook sho
      expect(disabledHookLog).toBeUndefined();
    });

-    it.only('should respect disabled hooks across multiple operations', async () => {
+    it('should respect disabled hooks across multiple operations', async () => {
      // 1. First setup to get the test directory and prepare the hook scripts
      rig.setup('should respect disabled hooks across multiple operations');
-      console.log(`[Test] testDir: ${rig.testDir}`);

      const activeHookScript = `const fs = require('fs');
 console.log(JSON.stringify({decision: "allow", systemMessage: "Active hook executed"}));`;
@@ -1911,7 +1912,6 @@ console.log(JSON.stringify({decision: "block", systemMessage: "Disabled hook sho
      writeFileSync(disabledPath, disabledHookScript);

      const normalizedDisabledCmd = normalizePath(`node "${disabledPath}"`);
-      console.log(`[Test] normalizedDisabledCmd: ${normalizedDisabledCmd}`);

      rig.setup('should respect disabled hooks across multiple operations', {
        settings: {
@@ -965,11 +965,6 @@ export class CoreToolScheduler {

      // The active tool is finished. Move it to the completed batch.
      const completedCall = activeCall as CompletedToolCall;
-
-      if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-        console.log(`[CoreToolScheduler] Completed tool call: ${completedCall.request.name}, status: ${completedCall.status}`);
-      }
-
      this.completedToolCallsForBatch.push(completedCall);
      logToolCall(this.config, new ToolCallEvent(completedCall));

@@ -233,16 +233,6 @@ please review the project settings (.gemini/settings.json) and remove them.`;
        } as HookRegistryEntry);
        const isDisabled = disabledHooks.includes(hookName);

-        if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-          console.log(`[HookRegistry] Processing hook: "${hookName}"`);
-          if (isDisabled) {
-            console.log(`[HookRegistry] Hook is DISABLED (found in disabledHooks list)`);
-          }
-          if (disabledHooks.length > 0) {
-            console.log(`[HookRegistry] Current disabledHooks: ${JSON.stringify(disabledHooks)}`);
-          }
-        }
-
        // Add source to hook config
        hookConfig.source = source;

@@ -35,6 +35,7 @@ const DEFAULT_HOOK_TIMEOUT = 60000;
 * Exit code constants for hook execution
 */
 const EXIT_CODE_SUCCESS = 0;
+const EXIT_CODE_BLOCKING_ERROR = 2;
 const EXIT_CODE_NON_BLOCKING_ERROR = 1;

 /**
@@ -268,12 +269,6 @@ export class HookRunner {
        shellConfig.shell,
      );

-      if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-        console.log(`[HookRunner] shellConfig: ${JSON.stringify(shellConfig)}`);
-        console.log(`[HookRunner] command: ${command}`);
-        console.log(`[HookRunner] cwd: ${input.cwd}`);
-      }
-
      // Set up environment variables
      const env = {
        ...sanitizeEnvironment(process.env, this.config.sanitizationConfig),
@@ -338,23 +333,11 @@ export class HookRunner {
        stderr += data.toString();
      });

-      child.on('exit', (code, signal) => {
-        if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-          console.log(`[HookRunner] Hook exit. code: ${code}, signal: ${signal}`);
-        }
-      });
-
      // Handle process exit
      child.on('close', (exitCode) => {
        clearTimeout(timeoutHandle);
        const duration = Date.now() - startTime;

-        if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-          console.log(`[HookRunner] Hook closed. exitCode: ${exitCode}, duration: ${duration}ms`);
-          console.log(`[HookRunner] stdout: ${stdout}`);
-          console.log(`[HookRunner] stderr: ${stderr}`);
-        }
-
        if (timedOut) {
          resolve({
            hookConfig,
@@ -411,10 +394,6 @@ export class HookRunner {
        clearTimeout(timeoutHandle);
        const duration = Date.now() - startTime;

-        if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-          console.log(`[HookRunner] Hook process error: ${error.message}`);
-        }
-
        resolve({
          hookConfig,
          eventName,
@@ -450,9 +429,6 @@ export class HookRunner {
    text: string,
    exitCode: number,
  ): HookOutput {
-    if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-      console.log(`[HookRunner] convertPlainTextToHookOutput: exitCode=${exitCode}, text="${text}"`);
-    }
    if (exitCode === EXIT_CODE_SUCCESS) {
      // Success - treat as system message or additional context
      return {
@@ -360,11 +360,9 @@ export class PolicyEngine {
      );

      if (match) {
-        if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-          console.log(
-            `[PolicyEngine.check] MATCHED rule: toolName=${rule.toolName}, decision=${rule.decision}, priority=${rule.priority}, source=${rule.source}`,
-          );
-        }
+        debugLogger.debug(
+          `[PolicyEngine.check] MATCHED rule: toolName=${rule.toolName}, decision=${rule.decision}, priority=${rule.priority}, argsPattern=${rule.argsPattern?.source || 'none'}`,
+        );

        if (isShellCommand && toolName) {
          const shellResult = await this.checkShellCommand(
@@ -391,11 +389,9 @@ export class PolicyEngine {

    // Default if no rule matched
    if (decision === undefined) {
-      if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-        console.log(
-          `[PolicyEngine.check] NO MATCH - using default decision: ${this.defaultDecision}`,
-        );
-      }
+      debugLogger.debug(
+        `[PolicyEngine.check] NO MATCH - using default decision: ${this.defaultDecision}`,
+      );
      if (toolName && SHELL_TOOL_NAMES.includes(toolName)) {
        const shellResult = await this.checkShellCommand(
          toolName,
@@ -60,18 +60,6 @@ export function sanitizeTestName(name: string) {
    .replace(/-+/g, '-');
 }

-/**
- * Normalizes a path for use in command-line arguments.
- * On Windows, this converts backslashes to forward slashes.
- */
-export function normalizePath(p: string): string {
-  const result = p.replace(/\\/g, '/');
-  if (process.env['CI'] === 'true' || process.env['VERBOSE'] === 'true') {
-    console.log(`[normalizePath] "${p}" -> "${result}"`);
-  }
-  return result;
-}
-
 // Helper to create detailed error messages
 export function createToolCallErrorMessage(
  expectedTools: string | string[],
@@ -373,43 +361,30 @@ export class TestRig {
    this.testDir = join(testFileDir, sanitizedName);
    this.homeDir = join(testFileDir, sanitizedName + '-home');

-    if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-      console.log(`[TestRig] Setting up test: ${testName}`);
-      console.log(`[TestRig] testDir: ${this.testDir}`);
-      console.log(`[TestRig] homeDir: ${this.homeDir}`);
-    }
-
    if (!this._initialized) {
      // Clean up existing directories from previous runs (e.g. retries)
-      const cleanDir = (dir: string, name: string) => {
+      const cleanDir = (dir: string) => {
        if (fs.existsSync(dir)) {
-          if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-            console.log(`[TestRig] Cleaning up existing ${name}: ${dir}`);
-          }
          for (let i = 0; i < 5; i++) {
            try {
              fs.rmSync(dir, { recursive: true, force: true });
              return;
            } catch (err) {
              if (i === 4) throw err;
-              if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-                console.log(
-                  `[TestRig] Failed to clean ${name} (attempt ${i + 1}): ${err}`,
-                );
-              }
-              // Use timeout on windows, sleep on others
              const sleepCmd =
                process.platform === 'win32' ? 'timeout /t 1' : 'sleep 1';
              try {
                execSync(sleepCmd);
-              } catch {}
+              } catch {
+                /* ignore */
+              }
            }
          }
        }
      };

-      cleanDir(this.testDir, 'testDir');
-      cleanDir(this.homeDir, 'homeDir');
+      cleanDir(this.testDir);
+      cleanDir(this.homeDir);
      this._initialized = true;
    }

@@ -492,17 +467,6 @@ export class TestRig {
    return filePath;
  }

-  createScript(fileName: string, content: string) {
-    if (!this.testDir) {
-      throw new Error(
-        'TestRig.setup must be called before creating files or scripts',
-      );
-    }
-    const scriptPath = join(this.testDir, fileName);
-    writeFileSync(scriptPath, content);
-    return normalizePath(scriptPath);
-  }
-
  mkdir(dir: string) {
    mkdirSync(join(this.testDir!, dir), { recursive: true });
  }
@@ -609,11 +573,6 @@ export class TestRig {
    });
    this._spawnedProcesses.push(child);

-    if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-      console.log(`[TestRig] Running: ${command} ${commandArgs.join(' ')}`);
-      console.log(`[TestRig] CWD: ${this.testDir}`);
-    }
-
    let stdout = '';
    let stderr = '';

@@ -642,8 +601,7 @@ export class TestRig {
      }
    });

-    const isWinCI = os.platform() === 'win32' && process.env['CI'] === 'true';
-    const timeout = options.timeout ?? (isWinCI ? 600000 : 300000); // 10 mins on Win CI, 5 mins otherwise
+    const timeout = options.timeout ?? 300000;
    const promise = new Promise<string>((resolve, reject) => {
      const timer = setTimeout(() => {
        child.kill('SIGKILL');
@@ -814,8 +772,7 @@ export class TestRig {
      }
    });

-    const isWinCI = os.platform() === 'win32' && process.env['CI'] === 'true';
-    const timeout = options.timeout ?? (isWinCI ? 600000 : 300000); // 10 mins on Win CI, 5 mins otherwise
+    const timeout = options.timeout ?? 300000;
    const promise = new Promise<string>((resolve, reject) => {
      const timer = setTimeout(() => {
        child.kill('SIGKILL');
@@ -1257,11 +1214,6 @@ export class TestRig {
    }[] = [];

    for (const logData of parsedLogs) {
-      if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-        if (logData.attributes?.['event.name']?.includes('tool')) {
-          console.log(`[TestRig] Found tool-related log: ${JSON.stringify(logData.attributes)}`);
-        }
-      }
      // Look for tool call logs
      if (
        logData.attributes &&
@@ -1359,18 +1311,21 @@ export class TestRig {

    const envVars = this._getCleanEnv(options?.env);

-    // Ensure PATH is included for pty
-    if (!envVars['PATH'] && process.env['PATH']) {
-      envVars['PATH'] = process.env['PATH'];
-    }
-
-    // Add critical Windows environment variables if missing
+    // node-pty on windows often needs these to spawn correctly
    if (process.platform === 'win32') {
-      ['SystemRoot', 'COMSPEC', 'windir', 'PATHEXT'].forEach((key) => {
-        if (!envVars[key] && process.env[key]) {
-          envVars[key] = process.env[key];
+      const windowsCriticalVars = [
+        'SystemRoot',
+        'COMSPEC',
+        'windir',
+        'PATHEXT',
+        'TEMP',
+        'TMP',
+      ];
+      for (const v of windowsCriticalVars) {
+        if (process.env[v] && !envVars[v]) {
+          envVars[v] = process.env[v]!;
        }
-      });
+      }
    }

    const ptyOptions: pty.IPtyForkOptions = {
@@ -1384,14 +1339,6 @@ export class TestRig {
    };

    const executable = command === 'node' ? process.execPath : command;
-    if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-      console.log(`[TestRig] Spawning PTY: ${executable}`);
-      console.log(`[TestRig] Args: ${JSON.stringify(commandArgs)}`);
-      // Only log keys to avoid leaking sensitive info
-      console.log(
-        `[TestRig] Env Keys: ${Object.keys(ptyOptions.env!).join(', ')}`,
-      );
-    }
    const ptyProcess = pty.spawn(executable, commandArgs, ptyOptions);

    const run = new InteractiveRun(ptyProcess);
@@ -1419,11 +1366,6 @@ export class TestRig {
    }[] = [];

    for (const logData of parsedLogs) {
-      if (env['VERBOSE'] === 'true' || env['CI'] === 'true') {
-        if (logData.attributes?.['event.name']?.includes('hook')) {
-          console.log(`[TestRig] Found hook-related log: ${JSON.stringify(logData.attributes)}`);
-        }
-      }
      // Look for tool call logs
      if (
        logData.attributes &&
@@ -1468,3 +1410,11 @@ export class TestRig {
    throw new Error(`pollCommand timed out after ${timeout}ms`);
  }
 }
+
+/**
+ * Normalizes a path for cross-platform matching (replaces backslashes with forward slashes).
+ */
+export function normalizePath(p: string | undefined): string | undefined {
+  if (!p) return p;
+  return p.replace(/\\/g, '/');
+}