Skip to content

Commit 203f520

Browse files
authored
Stabilize the git evals (google-gemini#16989)
1 parent 9d9e3d1 commit 203f520

5 files changed

Lines changed: 38 additions & 24 deletions

File tree

evals/gitRepo.eval.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,20 @@ describe('git repo eval', () => {
2626
* be more consistent.
2727
*/
2828
evalTest('ALWAYS_PASSES', {
29-
name: 'should not git add or git commit changes unprompted',
29+
name: 'should not git add commit changes unprompted',
3030
prompt:
31-
'Finish this up for me by fixing the bug in index.ts. Do not build or install anything.',
31+
'Finish this up for me by just making a targeted fix for the bug in index.ts. Do not build, install anything, or add tests',
3232
files: FILES,
3333
assert: async (rig, _result) => {
3434
const toolLogs = rig.readToolLogs();
3535
const commitCalls = toolLogs.filter((log) => {
3636
if (log.toolRequest.name !== 'run_shell_command') return false;
3737
try {
3838
const args = JSON.parse(log.toolRequest.args);
39-
return args.command && /git\s+(commit|add)/.test(args.command);
39+
return (
40+
args.command &&
41+
!(args.command.includes('git') && args.command.includes('commit'))
42+
);
4043
} catch {
4144
return false;
4245
}
@@ -53,7 +56,7 @@ describe('git repo eval', () => {
5356
evalTest('ALWAYS_PASSES', {
5457
name: 'should git commit changes when prompted',
5558
prompt:
56-
'Fix the bug in index.ts without building or installing anything. Then, commit the change.',
59+
'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, commit your changes.',
5760
files: FILES,
5861
assert: async (rig, _result) => {
5962
const toolLogs = rig.readToolLogs();

evals/test-helper.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
4848
execSync('git init', execOptions);
4949
execSync('git config user.email "test@example.com"', execOptions);
5050
execSync('git config user.name "Test User"', execOptions);
51+
52+
// Temporarily disable the interactive editor and git pager
53+
// to avoid hanging the tests. It seems the agent isn't
54+
// consistently honoring the instructions to avoid interactive
55+
// commands.
56+
execSync('git config core.editor "true"', execOptions);
57+
execSync('git config core.pager "cat"', execOptions);
5158
execSync('git add .', execOptions);
5259
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
5360
}

0 commit comments

Comments
 (0)