-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathcli.py
More file actions
232 lines (183 loc) · 6 KB
/
Copy pathcli.py
File metadata and controls
232 lines (183 loc) · 6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# gittaskbench/cli.py
import sys
import argparse
import logging
from typing import List, Optional
from pathlib import Path
from gittaskbench import __version__
from gittaskbench.utils import logger, setup_logger, find_project_root
from gittaskbench.task_loader import load_task
from gittaskbench.evaluator import run_evaluation
from gittaskbench.result_analyzer import analyze_results
def grade_command(args: argparse.Namespace) -> int:
"""
Handle the 'grade' subcommand.
Args:
args: Parsed command-line arguments
Returns:
Exit code (0 for success, non-zero for failure)
"""
# Set log level based on verbosity
if args.verbose:
logger.setLevel(logging.DEBUG)
if args.all:
project_root = find_project_root()
config_dir = project_root / "config"
if not config_dir.exists():
logger.error(f"Config directory not found: {config_dir}")
return 1
all_task_ids = []
for task_dir in config_dir.iterdir():
if task_dir.is_dir():
task_id = task_dir.name
all_task_ids.append(task_id)
overall_success = True
for task_id in all_task_ids:
logger.info(f"Loading task: {task_id}")
task = load_task(task_id, args.output_dir, args.result)
if not task:
logger.error(f"Failed to load task: {task_id}")
overall_success = False
continue
success = run_evaluation(task)
if not success:
overall_success = False
return 0 if overall_success else 1
else:
if not args.taskid:
logger.error("The --taskid argument is required when --all is false.")
return 1
logger.info(f"Loading task: {args.taskid}")
# Load task information
task = load_task(args.taskid, args.output_dir, args.result)
if not task:
logger.error(f"Failed to load task: {args.taskid}")
return 1
# Run evaluation
success = run_evaluation(task)
return 0 if success else 1
def eval_command(args: argparse.Namespace) -> int:
"""
Handle the 'eval' subcommand.
Args:
args: Parsed command-line arguments
Returns:
Exit code (0 for success, non-zero for failure)
"""
# Set log level based on verbosity
if args.verbose:
logger.setLevel(logging.DEBUG)
project_root = find_project_root()
result_dir = project_root / (args.result if args.result else "test_results")
if not result_dir.exists():
logger.error(f"Result directory not found: {result_dir}")
return 1
# 如果没有指定 output_file 参数,使用默认文件名
if args.output_file is None:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
output_file = Path(f"evaluation_report_{timestamp}.txt")
else:
output_file = Path(args.output_file)
# Analyze results
analyze_results(result_dir, output_file)
return 0
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
"""
Parse command-line arguments.
Args:
args: Command-line arguments to parse (defaults to sys.argv[1:])
Returns:
Parsed arguments namespace
"""
parser = argparse.ArgumentParser(
prog="gittaskbench",
description="GitTaskBench - A tool for benchmarking agent tasks"
)
parser.add_argument(
'--version',
action='version',
version=f'%(prog)s {__version__}'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='Enable verbose output'
)
# Create subparsers for different commands
subparsers = parser.add_subparsers(
title='commands',
dest='command',
help='Command to execute'
)
# Grade command
grade_parser = subparsers.add_parser(
'grade',
help='Grade a task completion'
)
grade_parser.add_argument(
'--taskid',
help='Task ID to evaluate'
)
grade_parser.add_argument(
'--output_dir',
help='Directory containing agent output (overrides config file)'
)
grade_parser.add_argument(
'--result',
help='Directory to store the result file. If provided, overrides the config file.'
)
grade_parser.add_argument(
'--all',
action='store_true',
default=False,
help='Run evaluation for all tasks. Default is false.'
)
# Set the handler for the grade command
grade_parser.set_defaults(func=grade_command)
# Eval command
eval_parser = subparsers.add_parser(
'eval',
help='Evaluate results from a directory'
)
eval_parser.add_argument(
'--result',
default="test_results",
help='Directory containing the result files. Defaults to test_results.'
)
# add output_file
eval_parser.add_argument(
'--output_file',
help='File path to write the evaluation report to. If not provided, a default file will be created in the current directory.'
)
# Set the handler for the eval command
eval_parser.set_defaults(func=eval_command)
# Parse arguments
parsed_args = parser.parse_args(args)
# Check if a command was provided
if not parsed_args.command:
parser.print_help()
sys.exit(1)
return parsed_args
def main() -> int:
"""
Main entry point for the CLI.
Returns:
Exit code (0 for success, non-zero for failure)
"""
try:
args = parse_args()
# Call the appropriate command handler
if hasattr(args, 'func'):
return args.func(args)
else:
logger.error("No command specified")
return 1
except Exception as e:
logger.critical(f"Unexpected error: {str(e)}")
if logger.level == logging.DEBUG:
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())