Skip to content

Commit b5818fa

Browse files
Fix smolagents benchmark (#1377)
Co-authored-by: Albert Villanova del Moral <[email protected]>
1 parent ce8e3d6 commit b5818fa

File tree

4 files changed

Lines changed: 38 additions & 21 deletions

examples/smolagents_benchmark/run.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
ToolCallingAgent,
2323
VisitWebpageTool,
2424
)
25-
from smolagents.agents import ActionStep
2625

2726

2827
load_dotenv()
@@ -80,7 +79,6 @@ def parse_arguments():
8079
parser.add_argument(
8180
"--push-answers-to-hub",
8281
action="store_true",
83-
default=False,
8482
help="Push the answers to the hub",
8583
)
8684
parser.add_argument(
@@ -113,8 +111,15 @@ def serialize_agent_error(obj):
113111
def append_answer(entry: dict, jsonl_file: str) -> None:
114112
jsonl_file = Path(jsonl_file)
115113
jsonl_file.parent.mkdir(parents=True, exist_ok=True)
114+
115+
def convert_to_serializable(obj):
116+
if hasattr(obj, "dict"):
117+
return obj.dict()
118+
else:
119+
raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
120+
116121
with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
117-
fp.write(json.dumps(entry) + "\n")
122+
fp.write(json.dumps(entry, default=convert_to_serializable) + "\n")
118123
assert os.path.exists(jsonl_file), "File not found!"
119124

120125

@@ -153,11 +158,7 @@ def answer_single_question(example, model, answers_file, action_type):
153158
# Run agent 🚀
154159
answer = str(agent.run(augmented_question))
155160
token_counts = agent.monitor.get_total_token_counts()
156-
# Remove memory from logs to make them more compact.
157-
for step in agent.memory.steps:
158-
if isinstance(step, ActionStep):
159-
step.agent_memory = None
160-
intermediate_steps = str(agent.memory.steps)
161+
intermediate_steps = [dict(message) for message in agent.write_memory_to_messages()]
161162

162163
end_time = time.time()
163164
except Exception as e:

examples/smolagents_benchmark/score.ipynb

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
},
1212
{
1313
"cell_type": "code",
14-
"execution_count": null,
14+
"execution_count": 17,
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
@@ -23,15 +23,15 @@
2323
"\n",
2424
"# Evaluation dataset\n",
2525
"# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n",
26-
"EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n",
26+
"EVAL_DATASET = \"smolagents/benchmark-v1\"\n",
2727
"\n",
2828
"# Answers dataset: it must be a gated dataset; required to score the answers\n",
29-
"ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n",
29+
"ANSWERS_DATASET = \"smolagents/answers\"\n",
3030
"# Whether to push the answers dataset to the Hub\n",
3131
"PUSH_ANSWERS_DATASET_TO_HUB = True\n",
3232
"\n",
3333
"# Results dataset\n",
34-
"RESULTS_DATASET = \"smolagents-benchmark/results\"\n",
34+
"RESULTS_DATASET = \"smolagents/results\"\n",
3535
"# Whether to push the results dataset to the Hub\n",
3636
"PUSH_RESULTS_DATASET_TO_HUB = True"
3737
]
@@ -45,7 +45,7 @@
4545
},
4646
{
4747
"cell_type": "code",
48-
"execution_count": null,
48+
"execution_count": 18,
4949
"metadata": {},
5050
"outputs": [],
5151
"source": [
@@ -189,6 +189,18 @@
189189
" push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n",
190190
" set_default=True,\n",
191191
"):\n",
192+
" \"\"\"\n",
193+
" Score answers from the given dataset subsets.\n",
194+
"\n",
195+
" Parameters:\n",
196+
" answers_subsets: List of dataset subsets to score\n",
197+
" answers_dataset: Dataset containing the answers\n",
198+
" date: Date to use for the config name\n",
199+
" push_to_hub_dataset: Dataset ID to push results to, or None to skip pushing\n",
200+
" set_default: If True, sets this config as the default config in the Hugging Face Hub dataset.\n",
201+
" This means when users load the dataset without specifying a config,\n",
202+
" this version will be loaded by default.\n",
203+
" \"\"\"\n",
192204
" if not answers_dataset:\n",
193205
" raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n",
194206
" date = date or datetime.date.today().isoformat()\n",
@@ -206,10 +218,7 @@
206218
" if push_to_hub_dataset:\n",
207219
" ds = datasets.Dataset.from_pandas(df)\n",
208220
" config = date\n",
209-
" set_default = set_default\n",
210-
" ds.push_to_hub(\n",
211-
" push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n",
212-
" )\n",
221+
" ds.push_to_hub(push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\")\n",
213222
" return df"
214223
]
215224
},
@@ -244,7 +253,7 @@
244253
},
245254
{
246255
"cell_type": "code",
247-
"execution_count": null,
256+
"execution_count": 12,
248257
"metadata": {},
249258
"outputs": [],
250259
"source": [
@@ -370,9 +379,9 @@
370379
],
371380
"metadata": {
372381
"kernelspec": {
373-
"display_name": "test",
382+
"display_name": "agents",
374383
"language": "python",
375-
"name": "test"
384+
"name": "python3"
376385
},
377386
"language_info": {
378387
"codemirror_mode": {

src/smolagents/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1301,7 +1301,7 @@ def generate(
13011301
messages=messages,
13021302
stop_sequences=stop_sequences,
13031303
tools_to_call_from=tools_to_call_from,
1304-
response_format=response_format,
1304+
# response_format=response_format,
13051305
convert_images_to_image_urls=True,
13061306
custom_role_conversions=self.custom_role_conversions,
13071307
**kwargs,

src/smolagents/monitoring.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ class TokenUsage:
4646
def __post_init__(self):
4747
self.total_tokens = self.input_tokens + self.output_tokens
4848

49+
def dict(self):
50+
return {
51+
"input_tokens": self.input_tokens,
52+
"output_tokens": self.output_tokens,
53+
"total_tokens": self.total_tokens,
54+
}
55+
4956

5057
@dataclass
5158
class Timing:

0 commit comments

Comments
 (0)