Fix smolagents benchmark (#1377)

aymeric-roucher · albertvillanova · web-flow · commit b5818fae05b8 · 2025-05-26T16:04:21.000+02:00
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
diff --git a/examples/smolagents_benchmark/run.py b/examples/smolagents_benchmark/run.py
@@ -22,7 +22,6 @@
     ToolCallingAgent,
     VisitWebpageTool,
 )
-from smolagents.agents import ActionStep
 
 
 load_dotenv()
@@ -80,7 +79,6 @@ def parse_arguments():
     parser.add_argument(
         "--push-answers-to-hub",
         action="store_true",
-        default=False,
         help="Push the answers to the hub",
     )
     parser.add_argument(
@@ -113,8 +111,15 @@ def serialize_agent_error(obj):
 def append_answer(entry: dict, jsonl_file: str) -> None:
     jsonl_file = Path(jsonl_file)
     jsonl_file.parent.mkdir(parents=True, exist_ok=True)
+
+    def convert_to_serializable(obj):
+        if hasattr(obj, "dict"):
+            return obj.dict()
+        else:
+            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
+
     with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
-        fp.write(json.dumps(entry) + "\n")
+        fp.write(json.dumps(entry, default=convert_to_serializable) + "\n")
     assert os.path.exists(jsonl_file), "File not found!"
 
 
@@ -153,11 +158,7 @@ def answer_single_question(example, model, answers_file, action_type):
             # Run agent 🚀
             answer = str(agent.run(augmented_question))
             token_counts = agent.monitor.get_total_token_counts()
-            # Remove memory from logs to make them more compact.
-            for step in agent.memory.steps:
-                if isinstance(step, ActionStep):
-                    step.agent_memory = None
-            intermediate_steps = str(agent.memory.steps)
+            intermediate_steps = [dict(message) for message in agent.write_memory_to_messages()]
 
         end_time = time.time()
     except Exception as e:
diff --git a/examples/smolagents_benchmark/score.ipynb b/examples/smolagents_benchmark/score.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -23,15 +23,15 @@
     "\n",
     "# Evaluation dataset\n",
     "# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n",
-    "EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n",
+    "EVAL_DATASET = \"smolagents/benchmark-v1\"\n",
     "\n",
     "# Answers dataset: it must be a gated dataset; required to score the answers\n",
-    "ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n",
+    "ANSWERS_DATASET = \"smolagents/answers\"\n",
     "# Whether to push the answers dataset to the Hub\n",
     "PUSH_ANSWERS_DATASET_TO_HUB = True\n",
     "\n",
     "# Results dataset\n",
-    "RESULTS_DATASET = \"smolagents-benchmark/results\"\n",
+    "RESULTS_DATASET = \"smolagents/results\"\n",
     "# Whether to push the results dataset to the Hub\n",
     "PUSH_RESULTS_DATASET_TO_HUB = True"
    ]
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,6 +189,18 @@
     "    push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n",
     "    set_default=True,\n",
     "):\n",
+    "    \"\"\"\n",
+    "    Score answers from the given dataset subsets.\n",
+    "\n",
+    "    Parameters:\n",
+    "        answers_subsets: List of dataset subsets to score\n",
+    "        answers_dataset: Dataset containing the answers\n",
+    "        date: Date to use for the config name\n",
+    "        push_to_hub_dataset: Dataset ID to push results to, or None to skip pushing\n",
+    "        set_default: If True, sets this config as the default config in the Hugging Face Hub dataset.\n",
+    "                     This means when users load the dataset without specifying a config,\n",
+    "                     this version will be loaded by default.\n",
+    "    \"\"\"\n",
     "    if not answers_dataset:\n",
     "        raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n",
     "    date = date or datetime.date.today().isoformat()\n",
@@ -206,10 +218,7 @@
     "    if push_to_hub_dataset:\n",
     "        ds = datasets.Dataset.from_pandas(df)\n",
     "        config = date\n",
-    "        set_default = set_default\n",
-    "        ds.push_to_hub(\n",
-    "            push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n",
-    "        )\n",
+    "        ds.push_to_hub(push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\")\n",
     "    return df"
    ]
   },
@@ -244,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -370,9 +379,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "test",
+   "display_name": "agents",
    "language": "python",
-   "name": "test"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/src/smolagents/models.py b/src/smolagents/models.py
@@ -1301,7 +1301,7 @@ def generate(
             messages=messages,
             stop_sequences=stop_sequences,
             tools_to_call_from=tools_to_call_from,
-            response_format=response_format,
+            # response_format=response_format,
             convert_images_to_image_urls=True,
             custom_role_conversions=self.custom_role_conversions,
             **kwargs,
diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py
@@ -46,6 +46,13 @@ class TokenUsage:
     def __post_init__(self):
         self.total_tokens = self.input_tokens + self.output_tokens
 
+    def dict(self):
+        return {
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "total_tokens": self.total_tokens,
+        }
+
 
 @dataclass
 class Timing: