
Fix smolagents benchmark #1377


Merged · 2 commits · May 26, 2025
Changes from 1 commit
18 changes: 10 additions & 8 deletions examples/smolagents_benchmark/run.py
@@ -22,7 +22,6 @@
     ToolCallingAgent,
     VisitWebpageTool,
 )
-from smolagents.agents import ActionStep


 load_dotenv()
@@ -80,7 +79,7 @@ def parse_arguments():
     parser.add_argument(
         "--push-answers-to-hub",
         action="store_true",
-        default=False,
+        default=True,
         help="Push the answers to the hub",
     )
     parser.add_argument(
@@ -113,8 +112,15 @@ def serialize_agent_error(obj):
 def append_answer(entry: dict, jsonl_file: str) -> None:
     jsonl_file = Path(jsonl_file)
     jsonl_file.parent.mkdir(parents=True, exist_ok=True)
+
+    def convert_to_serializable(obj):
+        if hasattr(obj, "dict"):
+            return obj.dict()
+        else:
+            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
+
     with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
-        fp.write(json.dumps(entry) + "\n")
+        fp.write(json.dumps(entry, default=convert_to_serializable) + "\n")
     assert os.path.exists(jsonl_file), "File not found!"


@@ -153,11 +159,7 @@ def answer_single_question(example, model, answers_file, action_type):
         # Run agent 🚀
         answer = str(agent.run(augmented_question))
         token_counts = agent.monitor.get_total_token_counts()
-        # Remove memory from logs to make them more compact.
-        for step in agent.memory.steps:
-            if isinstance(step, ActionStep):
-                step.agent_memory = None
-        intermediate_steps = str(agent.memory.steps)
+        intermediate_steps = [dict(message) for message in agent.write_memory_to_messages()]

         end_time = time.time()
     except Exception as e:
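For context on the serialization change to append_answer above: json.dumps only encodes built-in types, so the new convert_to_serializable hook is passed via the default= parameter and is called for any object the encoder cannot handle on its own (presumably the TokenUsage value stored under token_counts). Below is a minimal, self-contained sketch of that pattern; TokenStats is a hypothetical stand-in for any object exposing a .dict() method, and the output path is only an example.

```python
import json
import threading
from pathlib import Path


class TokenStats:
    """Hypothetical stand-in for an object (like TokenUsage) that exposes a .dict() method."""

    def __init__(self, input_tokens: int, output_tokens: int):
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens

    def dict(self) -> dict:
        return {"input_tokens": self.input_tokens, "output_tokens": self.output_tokens}


APPEND_ANSWER_LOCK = threading.Lock()


def convert_to_serializable(obj):
    # json.dumps calls this only for objects it cannot serialize natively.
    if hasattr(obj, "dict"):
        return obj.dict()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")


def append_answer(entry: dict, jsonl_file: str) -> None:
    path = Path(jsonl_file)
    path.parent.mkdir(parents=True, exist_ok=True)
    with APPEND_ANSWER_LOCK, open(path, "a", encoding="utf-8") as fp:
        fp.write(json.dumps(entry, default=convert_to_serializable) + "\n")


if __name__ == "__main__":
    append_answer({"answer": "42", "token_counts": TokenStats(120, 15)}, "answers/example.jsonl")
```

Re-raising TypeError for objects without a dict() method mirrors the encoder's default behavior, so genuinely unserializable entries still fail loudly rather than being silently dropped.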
33 changes: 21 additions & 12 deletions examples/smolagents_benchmark/score.ipynb
@@ -11,7 +11,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 17,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -23,15 +23,15 @@
"\n",
"# Evaluation dataset\n",
"# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n",
"EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n",
"EVAL_DATASET = \"smolagents/benchmark-v1\"\n",
"\n",
"# Answers dataset: it must be a gated dataset; required to score the answers\n",
"ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n",
"ANSWERS_DATASET = \"smolagents/answers\"\n",
"# Whether to push the answers dataset to the Hub\n",
"PUSH_ANSWERS_DATASET_TO_HUB = True\n",
"\n",
"# Results dataset\n",
"RESULTS_DATASET = \"smolagents-benchmark/results\"\n",
"RESULTS_DATASET = \"smolagents/results\"\n",
"# Whether to push the results dataset to the Hub\n",
"PUSH_RESULTS_DATASET_TO_HUB = True"
]
@@ -45,7 +45,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 18,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -189,6 +189,18 @@
" push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n",
" set_default=True,\n",
"):\n",
" \"\"\"\n",
" Score answers from the given dataset subsets.\n",
"\n",
" Parameters:\n",
" answers_subsets: List of dataset subsets to score\n",
" answers_dataset: Dataset containing the answers\n",
" date: Date to use for the config name\n",
" push_to_hub_dataset: Dataset ID to push results to, or None to skip pushing\n",
" set_default: If True, sets this config as the default config in the Hugging Face Hub dataset.\n",
" This means when users load the dataset without specifying a config,\n",
" this version will be loaded by default.\n",
" \"\"\"\n",
" if not answers_dataset:\n",
" raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n",
" date = date or datetime.date.today().isoformat()\n",
@@ -206,10 +218,7 @@
" if push_to_hub_dataset:\n",
" ds = datasets.Dataset.from_pandas(df)\n",
" config = date\n",
" set_default = set_default\n",
" ds.push_to_hub(\n",
" push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n",
" )\n",
" ds.push_to_hub(push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\")\n",

Collaborator (author): @albertvillanova I removed the set_default argument because, for me, it was trying to pop the default config from a previous dataset that didn't exist, which raised an error. But there's probably a better way to fix this issue?

Member: I am trying to figure out what the error was, when it was generated, and how to fix it...

Member: Until the fix is released, I would suggest not passing set_default, as you did.

" return df"
]
},
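As suggested in the review thread above, the per-date results can be pushed as a named config without passing set_default, leaving whatever default config the Hub dataset already has untouched. A minimal sketch of that call, assuming a pandas DataFrame df of scored results and a placeholder repo id:

```python
import datetime

import datasets
import pandas as pd

# Placeholder scored results; in the notebook, df comes from the scoring step above.
df = pd.DataFrame([{"model_id": "some/model", "benchmark": "example", "acc": 0.42}])

config = datetime.date.today().isoformat()
ds = datasets.Dataset.from_pandas(df)
# Omitting set_default sidesteps the error discussed above; the dated config
# can still be selected explicitly with config_name when loading.
ds.push_to_hub(
    "your-org/results",  # hypothetical dataset repo id
    config_name=config,
    commit_message=f"Upload {config} results",
)
```

A specific run can then be loaded back with datasets.load_dataset("your-org/results", config).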
@@ -244,7 +253,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -370,9 +379,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "test",
+"display_name": "agents",
 "language": "python",
-"name": "test"
+"name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
2 changes: 1 addition & 1 deletion src/smolagents/models.py
@@ -1301,7 +1301,7 @@ def generate(
             messages=messages,
             stop_sequences=stop_sequences,
             tools_to_call_from=tools_to_call_from,
-            response_format=response_format,
+            # response_format=response_format,
             convert_images_to_image_urls=True,
             custom_role_conversions=self.custom_role_conversions,
             **kwargs,
7 changes: 7 additions & 0 deletions src/smolagents/monitoring.py
@@ -46,6 +46,13 @@ class TokenUsage:
     def __post_init__(self):
         self.total_tokens = self.input_tokens + self.output_tokens

+    def dict(self):
+        return {
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "total_tokens": self.total_tokens,
+        }
+

 @dataclass
 class Timing:
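A quick usage sketch of the new TokenUsage.dict() helper, assuming a smolagents build that includes this change (the token counts are made up): __post_init__ fills in total_tokens, and the plain dict returned here is exactly what convert_to_serializable in run.py hands back to json.dumps.

```python
import json

from smolagents.monitoring import TokenUsage  # assumes a version that includes this change

usage = TokenUsage(input_tokens=120, output_tokens=15)  # example values
print(usage.total_tokens)  # 135, computed in __post_init__
print(usage.dict())        # {'input_tokens': 120, 'output_tokens': 15, 'total_tokens': 135}

# The same mechanism that makes benchmark entries JSON-serializable in run.py:
print(json.dumps({"token_counts": usage}, default=lambda obj: obj.dict()))
```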