gemini-cli/evals/ask_user.eval.ts at c4d0e3ca364c6c6733406004003bbeb0d379427a · shamshad-ansari/gemini-cli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { ApprovalMode, isRecord } from '@google/gemini-cli-core';
import { appEvalTest, type AppEvalCase } from './app-test-helper.js';
import { type EvalPolicy } from './test-helper.js';

/**
 * Registers an app-level eval case with the settings shared by every
 * `ask_user` eval in this file.
 *
 * The caller's `configOverrides` are kept, except that `approvalMode` is
 * always set to `ApprovalMode.DEFAULT` and the two auto-update flags under
 * `general` are always set to `false`. Any other keys of an existing
 * `general` override are carried over when `isRecord` accepts that value;
 * otherwise `general` starts from an empty object.
 *
 * @param policy - Eval policy passed through unchanged to `appEvalTest`.
 * @param evalCase - The eval case to register; it is not mutated.
 * @returns Whatever `appEvalTest` returns for the adjusted case.
 */
function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
  const { configOverrides, files } = evalCase;
  const callerGeneral = configOverrides?.['general'];

  // Forced flags come after the spread so they win over the caller's values.
  const general = {
    ...(isRecord(callerGeneral) ? callerGeneral : {}),
    enableAutoUpdate: false,
    enableAutoUpdateNotification: false,
  };

  return appEvalTest(policy, {
    ...evalCase,
    configOverrides: {
      ...configOverrides,
      approvalMode: ApprovalMode.DEFAULT,
      general,
    },
    // Always hand over a fresh object, even when the case declares no files.
    files: { ...files },
  });
}

/**
 * Behavioral evals for the `ask_user` tool. Each case sets breakpoints on
 * the tools of interest and then inspects the pending confirmation that the
 * rig reports for them.
 */
describe('ask_user', () => {
  // Tool names used by the breakpoints and assertions below.
  const ASK_USER = 'ask_user';
  const ENTER_PLAN_MODE = 'enter_plan_mode';
  const RUN_SHELL_COMMAND = 'run_shell_command';

  // Failure message shared by the cases that only expect an ask_user call.
  const MISSING_ASK_USER_CONFIRMATION =
    'Expected a pending confirmation for ask_user tool';

  // An explicit request to ask a multiple choice question.
  askUserEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'Agent uses AskUser tool to present multiple choice options',
    prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
    setup: async (rig) => {
      rig.setBreakpoint([ASK_USER]);
    },
    assert: async (rig) => {
      const pending = await rig.waitForPendingConfirmation(ASK_USER);
      expect(pending, MISSING_ASK_USER_CONFIRMATION).toBeDefined();
    },
  });

  // A request to gather requirements before building a feature.
  askUserEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'Agent uses AskUser tool to clarify ambiguous requirements',
    files: {
      'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
    },
    prompt: `I want to build a new feature in this app. Ask me questions to clarify the requirements before proceeding.`,
    setup: async (rig) => {
      rig.setBreakpoint([ASK_USER]);
    },
    assert: async (rig) => {
      const pending = await rig.waitForPendingConfirmation(ASK_USER);
      expect(pending, MISSING_ASK_USER_CONFIRMATION).toBeDefined();
    },
  });

  // A large, underspecified rewrite: ask_user is expected, optionally after
  // an enter_plan_mode call.
  askUserEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'Agent uses AskUser tool before performing significant ambiguous rework',
    files: {
      'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
      'packages/core/src/util.ts': '// util\nexport function help() {}',
      'packages/core/package.json': JSON.stringify({
        name: '@google/gemini-cli-core',
      }),
      'README.md': '# Gemini CLI',
    },
    prompt: `I want to completely rewrite the core package to support the upcoming V2 architecture, but I haven't decided what that looks like yet. We need to figure out the requirements first. Can you ask me some questions to help nail down the design?`,
    setup: async (rig) => {
      rig.setBreakpoint([ENTER_PLAN_MODE, ASK_USER]);
    },
    assert: async (rig) => {
      const firstPending = await rig.waitForPendingConfirmation([
        ENTER_PLAN_MODE,
        ASK_USER,
      ]);
      expect(firstPending, 'Expected a tool call confirmation').toBeDefined();

      // If plan mode came first, let it through and wait for the question.
      const startedWithPlanMode = firstPending?.toolName === ENTER_PLAN_MODE;
      if (startedWithPlanMode) {
        await rig.resolveTool(ENTER_PLAN_MODE);
      }
      const askPending = startedWithPlanMode
        ? await rig.waitForPendingConfirmation(ASK_USER)
        : firstPending;

      expect(
        askPending?.toolName,
        'Expected ask_user to be called to clarify the significant rework',
      ).toBe(ASK_USER);
    },
  });

  // --- Regression tests for recent fixes ---

  // Regression test for issue #20177: the agent must not use `ask_user` to
  // confirm shell commands. The fix refined the prompt and the tool
  // definition to make clear that the UI handles shell command confirmation.
  // See fix: https://github.com/google-gemini/gemini-cli/pull/20504
  askUserEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'Agent does NOT use AskUser to confirm shell commands',
    files: {
      'package.json': JSON.stringify({
        scripts: { build: 'echo building' },
      }),
    },
    prompt: `Run 'npm run build' in the current directory.`,
    setup: async (rig) => {
      rig.setBreakpoint([RUN_SHELL_COMMAND, ASK_USER]);
    },
    assert: async (rig) => {
      const pending = await rig.waitForPendingConfirmation([
        RUN_SHELL_COMMAND,
        ASK_USER,
      ]);

      expect(pending, 'Expected a pending confirmation for a tool').toBeDefined();

      // Whichever tool hits its breakpoint first must be the shell command.
      expect(
        pending?.toolName,
        'ask_user should not be called to confirm shell commands',
      ).toBe(RUN_SHELL_COMMAND);
    },
  });
});