diff --git a/examples/smolagents_benchmark/run.py b/examples/smolagents_benchmark/run.py
index f2b60eb58..424ed4a77 100644
--- a/examples/smolagents_benchmark/run.py
+++ b/examples/smolagents_benchmark/run.py
@@ -22,7 +22,6 @@
     ToolCallingAgent,
     VisitWebpageTool,
 )
-from smolagents.agents import ActionStep
 
 
 load_dotenv()
@@ -80,7 +79,6 @@ def parse_arguments():
     parser.add_argument(
         "--push-answers-to-hub",
         action="store_true",
-        default=False,
         help="Push the answers to the hub",
     )
     parser.add_argument(
@@ -113,8 +111,15 @@ def serialize_agent_error(obj):
 def append_answer(entry: dict, jsonl_file: str) -> None:
     jsonl_file = Path(jsonl_file)
     jsonl_file.parent.mkdir(parents=True, exist_ok=True)
+
+    def convert_to_serializable(obj):
+        if hasattr(obj, "dict"):
+            return obj.dict()
+        else:
+            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
+
     with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
-        fp.write(json.dumps(entry) + "\n")
+        fp.write(json.dumps(entry, default=convert_to_serializable) + "\n")
     assert os.path.exists(jsonl_file), "File not found!"
 
 
@@ -153,11 +158,7 @@ def answer_single_question(example, model, answers_file, action_type):
         # Run agent 🚀
         answer = str(agent.run(augmented_question))
         token_counts = agent.monitor.get_total_token_counts()
-        # Remove memory from logs to make them more compact.
-        for step in agent.memory.steps:
-            if isinstance(step, ActionStep):
-                step.agent_memory = None
-        intermediate_steps = str(agent.memory.steps)
+        intermediate_steps = [dict(message) for message in agent.write_memory_to_messages()]
 
         end_time = time.time()
     except Exception as e:
diff --git a/examples/smolagents_benchmark/score.ipynb b/examples/smolagents_benchmark/score.ipynb
index b624d802c..d81906481 100644
--- a/examples/smolagents_benchmark/score.ipynb
+++ b/examples/smolagents_benchmark/score.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -23,15 +23,15 @@
     "\n",
     "# Evaluation dataset\n",
     "# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n",
-    "EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n",
+    "EVAL_DATASET = \"smolagents/benchmark-v1\"\n",
     "\n",
     "# Answers dataset: it must be a gated dataset; required to score the answers\n",
-    "ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n",
+    "ANSWERS_DATASET = \"smolagents/answers\"\n",
     "# Whether to push the answers dataset to the Hub\n",
     "PUSH_ANSWERS_DATASET_TO_HUB = True\n",
     "\n",
     "# Results dataset\n",
-    "RESULTS_DATASET = \"smolagents-benchmark/results\"\n",
+    "RESULTS_DATASET = \"smolagents/results\"\n",
     "# Whether to push the results dataset to the Hub\n",
     "PUSH_RESULTS_DATASET_TO_HUB = True"
    ]
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,6 +189,18 @@
     "    push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n",
     "    set_default=True,\n",
     "):\n",
+    "    \"\"\"\n",
+    "    Score answers from the given dataset subsets.\n",
+    "\n",
+    "    Parameters:\n",
+    "        answers_subsets: List of dataset subsets to score\n",
+    "        answers_dataset: Dataset containing the answers\n",
+    "        date: Date to use for the config name\n",
+    "        push_to_hub_dataset: Dataset ID to push results to, or None to skip pushing\n",
+    "        set_default: If True, sets this config as the default config in the Hugging Face Hub dataset.\n",
+    "            This means when users load the dataset without specifying a config,\n",
+    "            this version will be loaded by default.\n",
+    "    \"\"\"\n",
     "    if not answers_dataset:\n",
     "        raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n",
     "    date = date or datetime.date.today().isoformat()\n",
@@ -206,10 +218,7 @@
     "    if push_to_hub_dataset:\n",
     "        ds = datasets.Dataset.from_pandas(df)\n",
     "        config = date\n",
-    "        set_default = set_default\n",
-    "        ds.push_to_hub(\n",
-    "            push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n",
-    "        )\n",
+    "        ds.push_to_hub(push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\")\n",
     "    return df"
    ]
   },
@@ -244,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -370,9 +379,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "test",
+   "display_name": "agents",
    "language": "python",
-   "name": "test"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/src/smolagents/models.py b/src/smolagents/models.py
index 4ba0197dd..09009ae71 100644
--- a/src/smolagents/models.py
+++ b/src/smolagents/models.py
@@ -1301,7 +1301,7 @@ def generate(
             messages=messages,
             stop_sequences=stop_sequences,
             tools_to_call_from=tools_to_call_from,
-            response_format=response_format,
+            # response_format=response_format,
             convert_images_to_image_urls=True,
             custom_role_conversions=self.custom_role_conversions,
             **kwargs,
diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py
index 5f5c174da..b3727a4b6 100644
--- a/src/smolagents/monitoring.py
+++ b/src/smolagents/monitoring.py
@@ -46,6 +46,13 @@ class TokenUsage:
     def __post_init__(self):
         self.total_tokens = self.input_tokens + self.output_tokens
 
+    def dict(self):
+        return {
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "total_tokens": self.total_tokens,
+        }
+
 
 @dataclass
 class Timing:
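Taken together, the run.py and monitoring.py hunks implement a small JSON-serialization hook: json.dumps(entry, default=convert_to_serializable) invokes the hook for any value the encoder cannot handle natively, and the hook delegates to the value's dict() method, such as the one added to TokenUsage. A minimal, self-contained sketch of the pattern follows; the TokenUsage dataclass below is a stand-in mirroring smolagents.monitoring.TokenUsage for illustration only:

    import json
    from dataclasses import dataclass, field

    # Stand-in for smolagents.monitoring.TokenUsage (illustration only).
    @dataclass
    class TokenUsage:
        input_tokens: int
        output_tokens: int
        total_tokens: int = field(init=False)

        def __post_init__(self):
            self.total_tokens = self.input_tokens + self.output_tokens

        def dict(self):
            return {
                "input_tokens": self.input_tokens,
                "output_tokens": self.output_tokens,
                "total_tokens": self.total_tokens,
            }

    def convert_to_serializable(obj):
        # json.dumps calls this hook only for objects it cannot encode natively.
        if hasattr(obj, "dict"):
            return obj.dict()
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

    entry = {"question": "2+2?", "answer": "4", "token_counts": TokenUsage(12, 3)}
    print(json.dumps(entry, default=convert_to_serializable))
    # {"question": "2+2?", "answer": "4", "token_counts": {"input_tokens": 12, "output_tokens": 3, "total_tokens": 15}}

Any value in entry without a dict() method still raises TypeError, so unexpected non-serializable objects fail loudly rather than being silently stringified.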