forked from google-gemini/gemini-cli
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupdate_topic.eval.ts
More file actions
273 lines (246 loc) · 8.94 KB
/
Copy pathupdate_topic.eval.ts
File metadata and controls
273 lines (246 loc) · 8.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { evalTest } from './test-helper.js';
describe('update_topic_behavior', () => {
// Name of the tool under test, defined once so every assertion below compares against the same value.
const UPDATE_TOPIC_TOOL_NAME = 'update_topic';
/**
 * Verifies the desired behavior of the update_topic tool on a long, multi-step task.
 * update_topic is used by the agent to share periodic, concise updates about what it is
 * working on, independent of the regular model output and/or thoughts.
 *
 * The assertions below require that update_topic:
 *  - is called at least three times,
 *  - is the very first tool call,
 *  - is last called in the final 30% of the tool log,
 *  - is called at least once after the first tool call and before that end phase, and
 *  - appears in at most 1/2 of the turns, where a "turn" is a distinct prompt_id seen in
 *    the tool log. The ratio is only enforced for runs longer than 5 turns; usage above
 *    1/4 of the turns is logged as a warning (the stated goal is roughly 20%).
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'update_topic should be used at start, end and middle for complex tasks',
  prompt: `Create a simple users REST API using Express.
1. Initialize a new npm project and install express.
2. Create src/app.ts as the main entry point.
3. Create src/routes/userRoutes.ts for user routes.
4. Create src/controllers/userController.ts for user logic.
5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
6. Add a 'start' script to package.json.
7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
        private: true,
      },
      null,
      2,
    ),
    '.gemini/settings.json': JSON.stringify({
      general: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    // Fraction of the tool log after which a call counts as "near the end".
    const endPhaseFraction = 0.7;
    // The turn ratio is only meaningful for runs longer than this many turns.
    const minTurnsForRatioCheck = 5;
    // Hard ceiling on the share of turns that may contain an update_topic call.
    const maxTopicTurnRatio = 0.5;
    // Above this share the run still passes, but a warning is logged.
    const warnTopicTurnRatio = 0.25;

    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // 1. Assert that update_topic is called at least 3 times (start, middle, end).
    //    This also guarantees toolLogs is non-empty for the index access below.
    expect(
      topicCalls.length,
      `Expected at least 3 update_topic calls, but found ${topicCalls.length}`,
    ).toBeGreaterThanOrEqual(3);

    // 2. Assert update_topic is called at the very beginning (first tool call)
    expect(
      toolLogs[0].toolRequest.name,
      'First tool call should be update_topic',
    ).toBe(UPDATE_TOPIC_TOOL_NAME);

    // 3. Assert update_topic is called near the end
    const endPhaseStartIndex = toolLogs.length * endPhaseFraction;
    const lastTopicCallIndex = toolLogs
      .map((l) => l.toolRequest.name)
      .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
    expect(
      lastTopicCallIndex,
      'Expected update_topic to be used near the end of the task',
    ).toBeGreaterThanOrEqual(endPhaseStartIndex);

    // 4. Assert there is at least one update_topic call in the middle (between start and end phases).
    //    The check is positional: counting "all calls except the first and last" would be
    //    implied by assertion 1 and could never fail on its own.
    const middleTopicCalls = toolLogs.filter(
      (l, index) =>
        index > 0 &&
        index < endPhaseStartIndex &&
        l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );
    expect(
      middleTopicCalls.length,
      'Expected at least one update_topic call in the middle of the task',
    ).toBeGreaterThanOrEqual(1);

    // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.
    // We only enforce this for tasks that take more than 5 turns, as shorter tasks
    // naturally have a higher ratio when following the "start, middle, end" rule.
    const uniquePromptIds = new Set(
      toolLogs
        .map((l) => l.toolRequest.prompt_id)
        .filter((id) => id !== undefined),
    );
    const totalTurns = uniquePromptIds.size;
    if (totalTurns > minTurnsForRatioCheck) {
      const topicTurns = new Set(
        topicCalls
          .map((l) => l.toolRequest.prompt_id)
          .filter((id) => id !== undefined),
      );
      const topicTurnCount = topicTurns.size;
      const ratio = topicTurnCount / totalTurns;
      expect(
        ratio,
        `update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${(ratio * 100).toFixed(1)}%). Expected <= 50%.`,
      ).toBeLessThanOrEqual(maxTopicTurnRatio);

      // Ideal ratio is closer to 1/5 (20%). We log high usage as a warning.
      if (ratio > warnTopicTurnRatio) {
        console.warn(
          `[Efficiency Warning] update_topic usage is high: ${(ratio * 100).toFixed(1)}% (Goal: ~20%)`,
        );
      }
    }
  },
});
/**
 * Negative case: the prompt asks only for an explanation plus code snippets.
 * The run passes when the tool log contains no update_topic request at all.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
  approvalMode: 'default',
  prompt:
    'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',
  files: {
    '.gemini/settings.json': JSON.stringify({
      general: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    // Tally update_topic requests in a single pass over the tool log.
    const topicCallCount = rig
      .readToolLogs()
      .reduce(
        (count, entry) =>
          entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME ? count + 1 : count,
        0,
      );

    expect(
      topicCallCount,
      `Expected 0 update_topic calls for an informational task, but found ${topicCallCount}`,
    ).toBe(0);
  },
});
/**
 * Negative case: the prompt is a single targeted lookup of where a constant is defined,
 * run against a workspace holding one file with that definition.
 * The run passes when the tool log contains no update_topic request at all.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
  approvalMode: 'default',
  prompt:
    "Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",
  files: {
    'packages/core/src/tools/tool-names.ts':
      "export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",
    '.gemini/settings.json': JSON.stringify({
      general: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    // Only the number of matching requests matters, so pull the length straight off the filtered list.
    const { length: topicCallCount } = rig
      .readToolLogs()
      .filter(({ toolRequest }) => toolRequest.name === UPDATE_TOPIC_TOOL_NAME);

    expect(
      topicCallCount,
      `Expected 0 update_topic calls for a surgical symbol search, but found ${topicCallCount}`,
    ).toBe(0);
  },
});
/**
 * Positive case for a medium-sized refactor: routing logic is moved out of src/app.ts
 * into a new src/routes.ts. The run passes when update_topic was requested at least
 * twice and src/routes.ts exists in the test directory afterwards.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'update_topic should be used for medium complexity multi-step tasks',
  prompt:
    'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
      },
      null,
      2,
    ),
    'src/app.ts': `
import express from 'express';
const app = express();
app.get('/users', (req, res) => {
res.json([{id: 1, name: 'Alice'}]);
});
app.post('/users', (req, res) => {
res.status(201).send();
});
export default app;
`,
    '.gemini/settings.json': JSON.stringify({
      general: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // This is a multi-step task (read, create new file, edit old file).
    // It should clear the bar and use update_topic at least at the start and end.
    expect(
      topicCalls.length,
      `Expected at least 2 update_topic calls (start and end), but found ${topicCalls.length}`,
    ).toBeGreaterThanOrEqual(2);

    // Fail with a readable message instead of letting path.join throw on a missing test dir.
    expect(
      rig.testDir,
      'Expected rig.testDir to be set so the refactor output can be inspected',
    ).toBeTruthy();

    // Verify it actually did the refactoring to ensure it didn't just fail immediately
    const routesPath = path.join(rig.testDir!, 'src/routes.ts');
    expect(
      fs.existsSync(routesPath),
      `Expected the refactor to create ${routesPath}`,
    ).toBe(true);
  },
});
/**
 * Regression guard against consecutive update_topic calls.
 * Earlier versions of the update_topic system instruction, prior to
 * https://github.com/google-gemini/gemini-cli/pull/24640, were seen to emit the tool
 * several times in a row. The prompt below asks the agent to declare a strategy and then
 * immediately revise it; the run fails if any two adjacent entries in the tool log are
 * both update_topic requests.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'update_topic should not be called twice in a row',
  prompt: `
We need to build a C compiler.
Before you write any code, you must formally declare your strategy.
First, declare that you will build a Lexer.
Then, immediately realize that is wrong and declare that you will actually build a Parser instead.
Finally, create 'parser.c'.
`,
  files: {
    'package.json': JSON.stringify({ name: 'test-project' }),
    '.gemini/settings.json': JSON.stringify({
      general: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // Locate the first entry that, together with its predecessor, forms an update_topic pair.
    const secondOfPair = toolLogs.findIndex(
      (entry, position) =>
        position > 0 &&
        toolLogs[position - 1].toolRequest.name === UPDATE_TOPIC_TOOL_NAME &&
        entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    if (secondOfPair !== -1) {
      throw new Error(
        `Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${secondOfPair - 1} and ${secondOfPair}`,
      );
    }
  },
});
});