diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
index 525846ac9..c1c1cb7c9 100644
--- a/docs/source/saving-and-reading-results.mdx
+++ b/docs/source/saving-and-reading-results.mdx
@@ -13,6 +13,11 @@ To save the details of the evaluation, you can use the `--save-details`
 option. The details will be saved in a parquet file
 `{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`.
 
+If you want results to be saved in a custom path, you can set the `results-path-template` option.
+This lets you provide a string template for the path, which can use the following
+variables: `output_dir`, `org` and `model`. For example:
+`{output_dir}/{org}_{model}`. The template is used to build the path of the results file.
+
 ## Pushing results to the HuggingFace hub
 
 You can push the results and evaluation details to the HuggingFace hub. To do
diff --git a/src/lighteval/config/lighteval_config.py b/src/lighteval/config/lighteval_config.py
index f24a15184..138afb90f 100644
--- a/src/lighteval/config/lighteval_config.py
+++ b/src/lighteval/config/lighteval_config.py
@@ -60,6 +60,7 @@ class LightEvalLoggingArgs:
     """Arguments related to logging for LightEval"""
 
     output_dir: str
+    results_path_template: str | None = None
     save_details: bool = True
     push_to_hub: bool = False
     push_to_tensorboard: bool = False
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 2694bae81..3bf5e7923 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -97,6 +97,9 @@ class EvaluationTracker:
 
     Args:
         output_dir (`str`): Local folder path where you want results to be saved.
+        results_path_template (`str`, *optional*): Template to use for the results output path. For example,
+            `"{output_dir}/results_this_time_it_will_work/{org}_{model}"` saves the results inside the output directory,
+            in a `results_this_time_it_will_work` folder with a subfolder named after the organization and the model.
         save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`.
         push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub.
             Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
@@ -119,6 +122,7 @@ class EvaluationTracker:
     def __init__(
         self,
         output_dir: str,
+        results_path_template: str | None = None,
         save_details: bool = True,
         push_to_hub: bool = False,
         push_to_tensorboard: bool = False,
@@ -152,6 +156,7 @@ def __init__(
         self.tensorboard_repo = f"{hub_results_org}/tensorboard_logs"
         self.tensorboard_metric_prefix = tensorboard_metric_prefix
         self.nanotron_run_info = nanotron_run_info
+        self.results_path_template = results_path_template
 
         self.public = public
 
@@ -259,7 +264,14 @@ def push_to_wandb(self, results_dict: dict, details_datasets: dict) -> None:
         self.wandb_run.finish()
 
     def save_results(self, date_id: str, results_dict: dict):
-        output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
+        if self.results_path_template is not None:
+            org_model_parts = self.general_config_logger.model_name.split("/")
+            org = org_model_parts[0] if len(org_model_parts) >= 2 else ""
+            model = org_model_parts[1] if len(org_model_parts) >= 2 else org_model_parts[0]
+            output_dir = self.output_dir
+            output_dir_results = Path(self.results_path_template.format(output_dir=output_dir, org=org, model=model))
+        else:
+            output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
         self.fs.mkdirs(output_dir_results, exist_ok=True)
         output_results_file = output_dir_results / f"results_{date_id}.json"
         logger.info(f"Saving results to {output_results_file}")
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index 5255aaefc..af0b4b754 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -60,9 +60,6 @@ def accelerate(  # noqa C901
     custom_tasks: Annotated[
         Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
     ] = None,
-    cache_dir: Annotated[
-        Optional[str], Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANEL_NAME_1)
-    ] = None,
     num_fewshot_seeds: Annotated[
         int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
     ] = 1,
@@ -73,6 +70,13 @@ def accelerate(  # noqa C901
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -118,6 +122,7 @@ def accelerate(  # noqa C901
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py
index 92d271597..4e395f7e6 100644
--- a/src/lighteval/main_custom.py
+++ b/src/lighteval/main_custom.py
@@ -70,6 +70,13 @@ def custom(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANNEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2)
     ] = False,
@@ -101,6 +108,7 @@ def custom(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py
index 0e7afa95c..7f706abf0 100644
--- a/src/lighteval/main_endpoint.py
+++ b/src/lighteval/main_endpoint.py
@@ -72,6 +72,13 @@ def inference_endpoint(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -111,6 +118,7 @@ def inference_endpoint(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
@@ -185,6 +193,13 @@ def tgi(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -227,6 +242,7 @@ def tgi(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
@@ -302,6 +318,13 @@ def litellm(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -344,6 +367,7 @@ def litellm(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
@@ -420,6 +444,13 @@ def inference_providers(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -462,6 +493,7 @@ def inference_providers(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py
index 94004c065..013c971e9 100644
--- a/src/lighteval/main_nanotron.py
+++ b/src/lighteval/main_nanotron.py
@@ -81,6 +81,7 @@ def nanotron(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=lighteval_config.logging.output_dir,
+        results_path_template=lighteval_config.logging.results_path_template,
         hub_results_org=lighteval_config.logging.results_org,
         public=lighteval_config.logging.public_run,
         push_to_hub=lighteval_config.logging.push_to_hub,
diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py
index 43b16ca6a..4edc5f3e5 100644
--- a/src/lighteval/main_sglang.py
+++ b/src/lighteval/main_sglang.py
@@ -63,6 +63,13 @@ def sglang(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -104,6 +111,7 @@ def sglang(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py
index 31b37b100..b2eb26e15 100644
--- a/src/lighteval/main_vllm.py
+++ b/src/lighteval/main_vllm.py
@@ -66,6 +66,13 @@ def vllm(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template for the path where results are saved. You have access to three variables: `output_dir`, `org` and `model`. For example: `'{output_dir}/1234/{org}+{model}'`.",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -107,6 +114,7 @@ def vllm(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
diff --git a/tests/logging/test_evaluation_tracker.py b/tests/logging/test_evaluation_tracker.py
index aaa8f9d1c..e1abdb924 100644
--- a/tests/logging/test_evaluation_tracker.py
+++ b/tests/logging/test_evaluation_tracker.py
@@ -96,6 +96,30 @@ def test_results_logging(mock_evaluation_tracker: EvaluationTracker):
     assert saved_results["config_general"]["model_name"] == "test_model"
 
 
+def test_results_logging_template(mock_evaluation_tracker: EvaluationTracker):
+    task_metrics = {
+        "task1": {"accuracy": 0.8, "f1": 0.75},
+        "task2": {"precision": 0.9, "recall": 0.85},
+    }
+    mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics
+    mock_evaluation_tracker.results_path_template = "{output_dir}/{org}_{model}"
+
+    mock_evaluation_tracker.save()
+
+    results_dir = Path(mock_evaluation_tracker.output_dir) / "_test_model"
+    assert results_dir.exists()
+
+    result_files = list(results_dir.glob("results_*.json"))
+    assert len(result_files) == 1
+
+    with open(result_files[0], "r") as f:
+        saved_results = json.load(f)
+
+    assert "results" in saved_results
+    assert saved_results["results"] == task_metrics
+    assert saved_results["config_general"]["model_name"] == "test_model"
+
+
 @pytest.mark.evaluation_tracker(save_details=True)
 def test_details_logging(mock_evaluation_tracker, mock_datetime):
     task_details = {
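For reference, here is a minimal sketch of the path resolution this patch adds to `EvaluationTracker.save_results`. It is not part of the diff: the helper name `resolve_results_dir`, the `outputs` directory and the model names are hypothetical, for illustration only. From the CLI, the same behaviour is reached through the new `--results-path-template` option added to each backend command above.

```python
from pathlib import Path


def resolve_results_dir(output_dir: str, model_name: str, results_path_template: str | None) -> Path:
    """Sketch mirroring the directory resolution added to EvaluationTracker.save_results."""
    if results_path_template is None:
        # Default layout: {output_dir}/results/{model_name}
        return Path(output_dir) / "results" / model_name
    # Split "org/model" names; a bare model name yields an empty org, as exercised by the new test.
    org_model_parts = model_name.split("/")
    org = org_model_parts[0] if len(org_model_parts) >= 2 else ""
    model = org_model_parts[1] if len(org_model_parts) >= 2 else org_model_parts[0]
    return Path(results_path_template.format(output_dir=output_dir, org=org, model=model))


print(resolve_results_dir("outputs", "my-org/my-model", "{output_dir}/{org}_{model}"))  # outputs/my-org_my-model
print(resolve_results_dir("outputs", "my-model", "{output_dir}/{org}_{model}"))  # outputs/_my-model
print(resolve_results_dir("outputs", "my-org/my-model", None))  # outputs/results/my-org/my-model
```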