fix(policy): enhance shell command safety and parsing (#15034)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Abhi <43648792+abhipatel12@users.noreply.github.com>
This commit is contained in:
Allen Hutchison
2026-01-12 11:23:32 -08:00
committed by GitHub
parent 0167392f22
commit 64cde8d439
3 changed files with 526 additions and 30 deletions

View File

@@ -151,9 +151,13 @@ export class PolicyEngine {
serverName: string | undefined,
dir_path: string | undefined,
allowRedirection?: boolean,
): Promise<PolicyDecision> {
rule?: PolicyRule,
): Promise<{ decision: PolicyDecision; rule?: PolicyRule }> {
if (!command) {
return this.applyNonInteractiveMode(ruleDecision);
return {
decision: this.applyNonInteractiveMode(ruleDecision),
rule,
};
}
await initializeShellParsers();
@@ -163,7 +167,12 @@ export class PolicyEngine {
debugLogger.debug(
`[PolicyEngine.check] Command parsing failed for: ${command}. Falling back to ASK_USER.`,
);
return this.applyNonInteractiveMode(PolicyDecision.ASK_USER);
// Parsing logic failed, we can't trust it. Force ASK_USER (or DENY).
// We don't blame a specific rule here, unless the input rule was stricter.
return {
decision: this.applyNonInteractiveMode(PolicyDecision.ASK_USER),
rule: undefined,
};
}
// If there are multiple parts, or if we just want to validate the single part against DENY rules
@@ -173,14 +182,16 @@ export class PolicyEngine {
);
if (ruleDecision === PolicyDecision.DENY) {
return PolicyDecision.DENY;
return { decision: PolicyDecision.DENY, rule };
}
// Start optimistically. If all parts are ALLOW, the whole is ALLOW.
// We will downgrade if any part is ASK_USER or DENY.
let aggregateDecision = PolicyDecision.ALLOW;
let responsibleRule: PolicyRule | undefined;
for (const subCmd of subCommands) {
for (const rawSubCmd of subCommands) {
const subCmd = rawSubCmd.trim();
// Prevent infinite recursion for the root command
if (subCmd === command) {
if (!allowRedirection && hasRedirection(subCmd)) {
@@ -190,17 +201,16 @@ export class PolicyEngine {
// Redirection always downgrades ALLOW to ASK_USER
if (aggregateDecision === PolicyDecision.ALLOW) {
aggregateDecision = PolicyDecision.ASK_USER;
responsibleRule = undefined; // Inherent policy
}
} else {
// If the command is atomic (cannot be split further) and didn't
// trigger infinite recursion checks, we must respect the decision
// of the rule that triggered this check. If the rule was ASK_USER
// (e.g. wildcard), we must downgrade.
// Atomic command matching the rule.
if (
ruleDecision === PolicyDecision.ASK_USER &&
aggregateDecision === PolicyDecision.ALLOW
) {
aggregateDecision = PolicyDecision.ASK_USER;
responsibleRule = rule;
}
}
continue;
@@ -216,13 +226,17 @@ export class PolicyEngine {
// If any part is DENIED, the whole command is DENIED
if (subDecision === PolicyDecision.DENY) {
return PolicyDecision.DENY;
return {
decision: PolicyDecision.DENY,
rule: subResult.rule,
};
}
// If any part requires ASK_USER, the whole command requires ASK_USER
if (subDecision === PolicyDecision.ASK_USER) {
if (aggregateDecision === PolicyDecision.ALLOW) {
aggregateDecision = PolicyDecision.ASK_USER;
aggregateDecision = PolicyDecision.ASK_USER;
if (!responsibleRule) {
responsibleRule = subResult.rule;
}
}
@@ -237,13 +251,22 @@ export class PolicyEngine {
);
if (aggregateDecision === PolicyDecision.ALLOW) {
aggregateDecision = PolicyDecision.ASK_USER;
responsibleRule = undefined;
}
}
}
return this.applyNonInteractiveMode(aggregateDecision);
return {
decision: this.applyNonInteractiveMode(aggregateDecision),
// If we stayed at ALLOW, we return the original rule (if any).
// If we downgraded, we return the responsible rule (or undefined if implicit).
rule: aggregateDecision === ruleDecision ? rule : responsibleRule,
};
}
return this.applyNonInteractiveMode(ruleDecision);
return {
decision: this.applyNonInteractiveMode(ruleDecision),
rule,
};
}
/**
@@ -271,6 +294,18 @@ export class PolicyEngine {
`[PolicyEngine.check] toolCall.name: ${toolCall.name}, stringifiedArgs: ${stringifiedArgs}`,
);
// Check for shell commands upfront to handle splitting
let isShellCommand = false;
let command: string | undefined;
let shellDirPath: string | undefined;
if (toolCall.name && SHELL_TOOL_NAMES.includes(toolCall.name)) {
isShellCommand = true;
const args = toolCall.args as { command?: string; dir_path?: string };
command = args?.command;
shellDirPath = args?.dir_path;
}
// Find the first matching rule (already sorted by priority)
let matchedRule: PolicyRule | undefined;
let decision: PolicyDecision | undefined;
@@ -289,21 +324,31 @@ export class PolicyEngine {
`[PolicyEngine.check] MATCHED rule: toolName=${rule.toolName}, decision=${rule.decision}, priority=${rule.priority}, argsPattern=${rule.argsPattern?.source || 'none'}`,
);
if (toolCall.name && SHELL_TOOL_NAMES.includes(toolCall.name)) {
const args = toolCall.args as { command?: string; dir_path?: string };
decision = await this.checkShellCommand(
toolCall.name,
args?.command,
if (isShellCommand) {
const shellResult = await this.checkShellCommand(
toolCall.name!,
command,
rule.decision,
serverName,
args?.dir_path,
shellDirPath,
rule.allowRedirection,
rule,
);
decision = shellResult.decision;
if (shellResult.rule) {
matchedRule = shellResult.rule;
break;
}
// If no rule returned (e.g. downgraded to default ASK_USER due to redirection),
// we might still want to blame the matched rule?
// No, test says we should return undefined rule if implicit.
matchedRule = shellResult.rule;
break;
} else {
decision = this.applyNonInteractiveMode(rule.decision);
matchedRule = rule;
break;
}
matchedRule = rule;
break;
}
}
@@ -313,6 +358,22 @@ export class PolicyEngine {
`[PolicyEngine.check] NO MATCH - using default decision: ${this.defaultDecision}`,
);
decision = this.applyNonInteractiveMode(this.defaultDecision);
// If it's a shell command and we fell back to default, we MUST still verify subcommands!
// This is critical for security: "git commit && git push" where "git push" is DENY but "git commit" has no rule.
if (isShellCommand && decision !== PolicyDecision.DENY) {
const shellResult = await this.checkShellCommand(
toolCall.name!,
command,
decision, // default decision
serverName,
shellDirPath,
false, // no rule, so no allowRedirection
undefined, // no rule
);
decision = shellResult.decision;
matchedRule = shellResult.rule;
}
}
// If decision is not DENY, run safety checkers

View File

@@ -4,29 +4,107 @@
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, beforeEach } from 'vitest';
import { describe, it, expect, beforeEach, vi } from 'vitest';
// Mock shell-utils to avoid relying on tree-sitter WASM which is flaky in CI on Windows
vi.mock('../utils/shell-utils.js', async (importOriginal) => {
const actual =
await importOriginal<typeof import('../utils/shell-utils.js')>();
// Static map of test commands to their expected subcommands
// This mirrors what the real parser would output for these specific strings
const commandMap: Record<string, string[]> = {
'git log': ['git log'],
'git log --oneline': ['git log --oneline'],
'git logout': ['git logout'],
'git log && rm -rf /': ['git log', 'rm -rf /'],
'git log; rm -rf /': ['git log', 'rm -rf /'],
'git log || rm -rf /': ['git log', 'rm -rf /'],
'git log &&& rm -rf /': [], // Simulates parse failure
'echo $(rm -rf /)': ['echo $(rm -rf /)', 'rm -rf /'],
'echo $(git log)': ['echo $(git log)', 'git log'],
'echo `rm -rf /`': ['echo `rm -rf /`', 'rm -rf /'],
'diff <(git log) <(rm -rf /)': [
'diff <(git log) <(rm -rf /)',
'git log',
'rm -rf /',
],
'tee >(rm -rf /)': ['tee >(rm -rf /)', 'rm -rf /'],
'git log | rm -rf /': ['git log', 'rm -rf /'],
'git log --format=$(rm -rf /)': [
'git log --format=$(rm -rf /)',
'rm -rf /',
],
'git log && echo $(git log | rm -rf /)': [
'git log',
'echo $(git log | rm -rf /)',
'git log',
'rm -rf /',
],
'git log && echo $(git log)': ['git log', 'echo $(git log)', 'git log'],
'git log > /tmp/test': ['git log > /tmp/test'],
'git log @(Get-Process)': [], // Simulates parse failure (Bash parser vs PowerShell syntax)
'git commit -m "msg" && git push': ['git commit -m "msg"', 'git push'],
'git status && unknown_command': ['git status', 'unknown_command'],
'unknown_command_1 && another_unknown_command': [
'unknown_command_1',
'another_unknown_command',
],
'known_ask_command_1 && known_ask_command_2': [
'known_ask_command_1',
'known_ask_command_2',
],
};
return {
...actual,
initializeShellParsers: vi.fn(),
splitCommands: (command: string) => {
if (Object.prototype.hasOwnProperty.call(commandMap, command)) {
return commandMap[command];
}
const known = commandMap[command];
if (known) return known;
// Default fallback for unmatched simple cases in development, but explicit map is better
return [command];
},
hasRedirection: (command: string) =>
// Simple regex check sufficient for testing the policy engine's handling of the *result* of hasRedirection
/[><]/.test(command),
};
});
import { PolicyEngine } from './policy-engine.js';
import { PolicyDecision, ApprovalMode } from './types.js';
import type { FunctionCall } from '@google/genai';
import { buildArgsPatterns } from './utils.js';
describe('Shell Safety Policy', () => {
let policyEngine: PolicyEngine;
beforeEach(() => {
policyEngine = new PolicyEngine({
// Helper to create a policy engine with a simple command prefix rule
function createPolicyEngineWithPrefix(prefix: string) {
const argsPatterns = buildArgsPatterns(undefined, prefix, undefined);
// Since buildArgsPatterns returns array of patterns (strings), we pick the first one
// and compile it.
const argsPattern = new RegExp(argsPatterns[0]!);
return new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
// Mimic the regex generated by toml-loader for commandPrefix = ["git log"]
// Regex: "command":"git log(?:[\s"]|$)
argsPattern: /"command":"git log(?:[\s"]|$)/,
argsPattern,
decision: PolicyDecision.ALLOW,
priority: 1.01, // Higher priority than default
priority: 1.01,
},
],
defaultDecision: PolicyDecision.ASK_USER,
approvalMode: ApprovalMode.DEFAULT,
});
}
beforeEach(() => {
policyEngine = createPolicyEngineWithPrefix('git log');
});
it('SHOULD match "git log" exactly', async () => {
@@ -71,6 +149,25 @@ describe('Shell Safety Policy', () => {
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow "git log; rm -rf /" (semicolon separator)', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log; rm -rf /' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow "git log || rm -rf /" (OR separator)', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log || rm -rf /' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow "git log &&& rm -rf /" when prefix is "git log" (parse failure)', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
@@ -78,8 +175,341 @@ describe('Shell Safety Policy', () => {
};
// Desired behavior: Should fail safe (ASK_USER or DENY) because parsing failed.
// If we let it pass as "single command" that matches prefix, it's dangerous.
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow command substitution $(rm -rf /)', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'echo $(rm -rf /)' },
};
// `splitCommands` recursively finds nested commands (e.g., `rm` inside `echo $()`).
// The policy engine requires ALL extracted commands to be allowed.
// Since `rm` does not match the allowed prefix, this should result in ASK_USER.
const echoPolicy = createPolicyEngineWithPrefix('echo');
const result = await echoPolicy.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD allow command substitution if inner command is ALSO allowed', async () => {
// Both `echo` and `git` allowed.
const argsPatternsEcho = buildArgsPatterns(undefined, 'echo', undefined);
const argsPatternsGit = buildArgsPatterns(undefined, 'git', undefined); // Allow all git
const policyEngineWithBoth = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsEcho[0]!),
decision: PolicyDecision.ALLOW,
priority: 2,
},
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsGit[0]!),
decision: PolicyDecision.ALLOW,
priority: 2,
},
],
defaultDecision: PolicyDecision.ASK_USER,
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'echo $(git log)' },
};
const result = await policyEngineWithBoth.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ALLOW);
});
it('SHOULD NOT allow command substitution with backticks `rm -rf /`', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'echo `rm -rf /`' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow process substitution <(rm -rf /)', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'diff <(git log) <(rm -rf /)' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow process substitution >(rm -rf /)', async () => {
// Note: >(...) is output substitution, but syntax is similar.
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'tee >(rm -rf /)' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow piped commands "git log | rm -rf /"', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log | rm -rf /' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow argument injection via --arg=$(rm -rf /)', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log --format=$(rm -rf /)' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD NOT allow complex nested commands "git log && echo $(git log | rm -rf /)"', async () => {
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log && echo $(git log | rm -rf /)' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD allow complex allowed commands "git log && echo $(git log)"', async () => {
// Both `echo` and `git` allowed.
const argsPatternsEcho = buildArgsPatterns(undefined, 'echo', undefined);
const argsPatternsGit = buildArgsPatterns(undefined, 'git', undefined);
const policyEngineWithBoth = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsEcho[0]!),
decision: PolicyDecision.ALLOW,
priority: 2,
},
{
toolName: 'run_shell_command',
// Matches "git" at start of *subcommand*
argsPattern: new RegExp(argsPatternsGit[0]!),
decision: PolicyDecision.ALLOW,
priority: 2,
},
],
defaultDecision: PolicyDecision.ASK_USER,
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log && echo $(git log)' },
};
const result = await policyEngineWithBoth.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ALLOW);
});
it('SHOULD NOT allow generic redirection > /tmp/test', async () => {
// Current logic downgrades ALLOW to ASK_USER for redirections if redirection is not explicitly allowed.
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log > /tmp/test' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD allow generic redirection > /tmp/test if allowRedirection is true', async () => {
// If PolicyRule has allowRedirection: true, it should stay ALLOW
const argsPatternsGitLog = buildArgsPatterns(
undefined,
'git log',
undefined,
);
const policyWithRedirection = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsGitLog[0]!),
decision: PolicyDecision.ALLOW,
priority: 2,
allowRedirection: true,
},
],
defaultDecision: PolicyDecision.ASK_USER,
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log > /tmp/test' },
};
const result = await policyWithRedirection.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ALLOW);
});
it('SHOULD NOT allow PowerShell @(...) usage if it implies code execution', async () => {
// Bash parser fails on PowerShell syntax @(...) (returns empty subcommands).
// The policy engine correctly identifies this as unparseable and falls back to ASK_USER.
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git log @(Get-Process)' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
});
it('SHOULD match DENY rule even if nested/chained with unknown command', async () => {
// Scenario:
// git commit -m "..." (Unknown/No Rule -> ASK_USER)
// git push (DENY -> DENY)
// Overall should be DENY.
const argsPatternsPush = buildArgsPatterns(
undefined,
'git push',
undefined,
);
const denyPushPolicy = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsPush[0]!),
decision: PolicyDecision.DENY,
priority: 2,
},
],
defaultDecision: PolicyDecision.ASK_USER,
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git commit -m "msg" && git push' },
};
const result = await denyPushPolicy.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.DENY);
});
it('SHOULD aggregate ALLOW + ASK_USER to ASK_USER and blame the ASK_USER part', async () => {
// Scenario:
// `git status` (ALLOW) && `unknown_command` (ASK_USER by default)
// Expected: ASK_USER, and the matched rule should be related to the unknown_command
const argsPatternsGitStatus = buildArgsPatterns(
undefined,
'git status',
undefined,
);
const policyEngine = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsGitStatus[0]!),
decision: PolicyDecision.ALLOW,
priority: 2,
name: 'allow_git_status_rule', // Give a name to easily identify
},
],
defaultDecision: PolicyDecision.ASK_USER,
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'git status && unknown_command' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
// Expect the matched rule to be null/undefined since it's the default decision for 'unknown_command'
// or the rule that led to the ASK_USER decision. In this case, it should be the rule for 'unknown_command', which is the default decision.
// The policy engine's `matchedRule` will be the rule that caused the final decision.
// If it's a default ASK_USER, then `result.rule` should be undefined.
expect(result.rule).toBeUndefined();
});
it('SHOULD aggregate ASK_USER (default) + ASK_USER (rule) to ASK_USER and blame the specific ASK_USER rule', async () => {
// Scenario:
// `unknown_command_1` (ASK_USER by default) && `another_unknown_command` (ASK_USER by explicit rule)
// Expected: ASK_USER, and the matched rule should be the explicit ASK_USER rule
const argsPatternsAnotherUnknown = buildArgsPatterns(
undefined,
'another_unknown_command',
undefined,
);
const policyEngine = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsAnotherUnknown[0]!),
decision: PolicyDecision.ASK_USER,
priority: 2,
name: 'ask_another_unknown_command_rule',
},
],
defaultDecision: PolicyDecision.ASK_USER,
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'unknown_command_1 && another_unknown_command' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
// The first command triggers default ASK_USER (undefined rule).
// The second triggers explicit ASK_USER rule.
// We attribute to the first cause => undefined.
expect(result.rule).toBeUndefined();
});
it('SHOULD aggregate ASK_USER (rule) + ASK_USER (rule) to ASK_USER and blame the first specific ASK_USER rule in subcommands', async () => {
// Scenario:
// `known_ask_command_1` (ASK_USER by explicit rule 1) && `known_ask_command_2` (ASK_USER by explicit rule 2)
// Expected: ASK_USER, and the matched rule should be explicit ASK_USER rule 1.
// The current implementation prioritizes the rule that changes the decision to ASK_USER, if any.
// If multiple rules lead to ASK_USER, it takes the first one.
const argsPatternsAsk1 = buildArgsPatterns(
undefined,
'known_ask_command_1',
undefined,
);
const argsPatternsAsk2 = buildArgsPatterns(
undefined,
'known_ask_command_2',
undefined,
);
const policyEngine = new PolicyEngine({
rules: [
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsAsk1[0]!),
decision: PolicyDecision.ASK_USER,
priority: 2,
name: 'ask_rule_1',
},
{
toolName: 'run_shell_command',
argsPattern: new RegExp(argsPatternsAsk2[0]!),
decision: PolicyDecision.ASK_USER,
priority: 2,
name: 'ask_rule_2',
},
],
defaultDecision: PolicyDecision.ALLOW, // Set default to ALLOW to ensure rules are hit
});
const toolCall: FunctionCall = {
name: 'run_shell_command',
args: { command: 'known_ask_command_1 && known_ask_command_2' },
};
const result = await policyEngine.check(toolCall, undefined);
expect(result.decision).toBe(PolicyDecision.ASK_USER);
// Expect the rule that first caused ASK_USER to be blamed
expect(result.rule?.name).toBe('ask_rule_1');
});
});

View File

@@ -95,6 +95,11 @@ export type SafetyCheckerConfig =
| InProcessCheckerConfig;
export interface PolicyRule {
/**
* A unique name for the policy rule, useful for identification and debugging.
*/
name?: string;
/**
* The name of the tool this rule applies to.
* If undefined, the rule applies to all tools.