forked from google-gemini/gemini-cli
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgeneralist_delegation.eval.ts
More file actions
169 lines (160 loc) · 4.58 KB
/
Copy pathgeneralist_delegation.eval.ts
File metadata and controls
169 lines (160 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { appEvalTest } from './app-test-helper.js';
describe('generalist_delegation', () => {
  // Limit handed to every rig wait in the positive evals
  // (presumably a timeout in milliseconds — confirm against the rig API).
  const RIG_WAIT_LIMIT = 60000;

  /**
   * Config overrides shared by every eval in this suite: the `generalist`
   * agent override is enabled and the experimental agents flag is on.
   *
   * A new object is built on each call so no two evals share one instance.
   */
  const generalistEnabledConfig = () => ({
    agents: {
      overrides: {
        generalist: { enabled: true },
      },
    },
    experimental: {
      enableAgents: true,
    },
  });

  /**
   * Breakpoint handler for the negative evals: fails the eval when the
   * intercepted tool call is named `generalist`.
   */
  const expectNotGeneralist = (pendingCall: { toolName?: unknown }) => {
    expect(
      pendingCall.toolName,
      'Agent should NOT have delegated to generalist.',
    ).not.toBe('generalist');
  };

  // Fixture for the batch-fix eval: file1.ts … file10.ts, each holding the
  // same statement without a trailing semicolon.
  const unterminatedSources = Object.fromEntries(
    Array.from({ length: 10 }, (_, index): [string, string] => [
      `file${index + 1}.ts`,
      'console.log("no semi")',
    ]),
  );

  // --- Positive Evals (Should Delegate) ---

  // Ten files with the same mechanical fix: expect a generalist tool call.
  appEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should delegate batch error fixing to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: unterminatedSources,
    prompt:
      'I have 10 files (file1.ts to file10.ts) that are missing semicolons. Can you fix them?',
    setup: async (rig) => {
      // Break only on the generalist tool.
      rig.setBreakpoint(['generalist']);
    },
    assert: async (rig) => {
      const pendingCall = await rig.waitForPendingConfirmation(
        'generalist',
        RIG_WAIT_LIMIT,
      );
      expect(
        pendingCall,
        'Expected a tool call for generalist agent',
      ).toBeTruthy();

      // Let the intercepted call proceed, then wait for the rig to go idle.
      await rig.resolveTool(pendingCall);
      await rig.waitForIdle(RIG_WAIT_LIMIT);
    },
  });

  // The prompt never mentions an agent: delegation has to be the model's own choice.
  appEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should autonomously delegate complex batch task to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'src/a.ts': 'export const a = 1;',
      'src/b.ts': 'export const b = 2;',
      'src/c.ts': 'export const c = 3;',
      'src/d.ts': 'export const d = 4;',
      'src/e.ts': 'export const e = 5;',
    },
    prompt:
      'Please update all files in the src directory. For each file, add a comment at the top that says "Processed by Gemini".',
    setup: async (rig) => {
      // Break only on the generalist tool.
      rig.setBreakpoint(['generalist']);
    },
    assert: async (rig) => {
      const pendingCall = await rig.waitForPendingConfirmation(
        'generalist',
        RIG_WAIT_LIMIT,
      );
      expect(
        pendingCall,
        'Expected autonomously delegate to generalist for batch task',
      ).toBeTruthy();

      // Let the intercepted call proceed, then wait for the rig to go idle.
      await rig.resolveTool(pendingCall);
      await rig.waitForIdle(RIG_WAIT_LIMIT);
    },
  });

  // --- Negative Evals (Should NOT Delegate - Assertive Handling) ---

  // A one-word typo fix in a single file.
  appEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should NOT delegate simple read and fix to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'README.md': 'This is a proyect.',
    },
    prompt:
      'There is a typo in README.md ("proyect"). Please fix it to "project".',
    setup: async (rig) => {
      // Break on every tool so each call the agent makes can be checked.
      rig.setBreakpoint(['*']);
    },
    assert: async (rig) => {
      await rig.drainBreakpointsUntilIdle(expectNotGeneralist);

      // The static output must mention the corrected word.
      expect(rig.getStaticOutput()).toMatch(/project/i);
    },
  });

  // A direct question answerable from a single small file.
  appEvalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
    name: 'should NOT delegate simple direct question to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'src/VERSION': '1.2.3',
    },
    prompt: 'Can you tell me the version number in the src folder?',
    setup: async (rig) => {
      // Break on every tool so each call the agent makes can be checked.
      rig.setBreakpoint(['*']);
    },
    assert: async (rig) => {
      await rig.drainBreakpointsUntilIdle(expectNotGeneralist);

      // The static output must contain the version string from the fixture.
      expect(rig.getStaticOutput()).toMatch(/1\.2\.3/);
    },
  });
});