diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0551f9155..b832c5a5c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,7 +34,7 @@ repos:
   - repo: https://github.com/charliermarsh/ruff-pre-commit
     # Ruff version.
-    rev: 'v0.2.2'
+    rev: 'v0.11.10'
     hooks:
       - id: ruff
         args: ['--fix']
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 31b1b8752..55165074a 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import random
 import re
 from typing import Any, Dict, List, Optional, Union
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py
index 2fd85f69b..49010098c 100644
--- a/examples/nanotron/custom_evaluation_tasks.py
+++ b/examples/nanotron/custom_evaluation_tasks.py
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import re
 from dataclasses import asdict
 from typing import Dict, List, Tuple
diff --git a/pyproject.toml b/pyproject.toml
index ebca5bf17..931846b74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,7 +95,7 @@ nanotron = [
 ]
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
-quality = ["ruff==v0.2.2","pre-commit"]
+quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest==7.4.0","deepdiff"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 3bf5e7923..e654adcbc 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -603,7 +603,7 @@ def recreate_metadata_card(self, repo_id: str) -> None:  # noqa: C901
             f"To load the details from a run, you can for instance do the following:\n"
             f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
             f"## Latest results\n\n"
-            f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
+            f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})"
             f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
             f'You find each in the results and the "latest" split for each eval):\n\n'
             f"```python\n{results_string}\n```",
diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 2305332cd..9f8c9b3c4 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -556,7 +556,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
             if len(list_of_subtasks) > 1:
                 metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
                 self.metric_aggregated[average_task] = {
-                    metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
+                    metric: sum(self.metric_aggregated[k][metric] for k in list_of_subtasks) / len(list_of_subtasks)
                     for metric in metrics
                 }
diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py
index 1012bc3f7..80ee47357 100644
--- a/src/lighteval/metrics/imports/bert_scorer.py
+++ b/src/lighteval/metrics/imports/bert_scorer.py
@@ -22,6 +22,7 @@
 # SOFTWARE.
 
 """Simplified version of the BertScorer lib - we only import what we need."""
+
 import logging
 import os
 import time
diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index d383d61f9..3b7e4d537 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -127,7 +127,7 @@ def __init__(
         if self.backend == "inference-providers" and self.hf_provider is None:
             raise ValueError("When using 'inference-providers' as backend, you must specify an 'hf_provider'")
 
-    def __lazy_load_client(self):
+    def __lazy_load_client(self):  # noqa: C901
         match self.backend:
             # Both "openai" and "tgi" backends use the OpenAI-compatible API
             # They are handled separately to allow for backend-specific validation and setup
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 360d55487..efc762dec 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -624,16 +624,16 @@ class Metrics(Enum):
         sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_8_16 = SampleLevelMetricGrouping(
         metric_name="G-Pass@8-16:48_samples",
         sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -653,8 +653,8 @@ class Metrics(Enum):
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -674,8 +674,8 @@ class Metrics(Enum):
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     perfect_exact_match = SampleLevelMetric(
         metric_name="perfect_em",
diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index 7c12e12ef..030725a53 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -24,6 +24,7 @@
 Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
 A number of these aggregations come from the EleutherAIHarness
 """
+
 import logging
 import math
 from typing import Literal
diff --git a/src/lighteval/metrics/stderr.py b/src/lighteval/metrics/stderr.py
index cc9c6febb..e8d47e444 100644
--- a/src/lighteval/metrics/stderr.py
+++ b/src/lighteval/metrics/stderr.py
@@ -42,7 +42,7 @@
 
 def _stddev(arr):
     mu = np.mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+    return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))
 
 
 def mean_stderr(arr):
diff --git a/src/lighteval/metrics/utils/math_comparison.py b/src/lighteval/metrics/utils/math_comparison.py
index e90f53f7b..6592bb8d8 100644
--- a/src/lighteval/metrics/utils/math_comparison.py
+++ b/src/lighteval/metrics/utils/math_comparison.py
@@ -374,7 +374,7 @@ def are_flipped_inequalities_equal(a: Relational, b: Relational) -> bool:
 
     # Same type of relation (e.g. both <= or both >=)
    try:
-        if type(gold) == type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision):  # type: ignore
+        if type(gold) is type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision):  # type: ignore
            return True
    except TimeoutError:
        raise
diff --git a/src/lighteval/models/endpoints/inference_providers_model.py b/src/lighteval/models/endpoints/inference_providers_model.py
index f7dc13943..42a8b50a8 100644
--- a/src/lighteval/models/endpoints/inference_providers_model.py
+++ b/src/lighteval/models/endpoints/inference_providers_model.py
@@ -157,9 +157,9 @@ async def __call_api_parallel(
         results = []
         num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
 
-        assert len(prompts) == len(
-            num_sampless
-        ), f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
+        assert len(prompts) == len(num_sampless), (
+            f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
+        )
 
         async def bounded_api_call(prompt, num_samples):
             async with self.semaphore:
diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py
index 42771b1ae..6a6e20c29 100644
--- a/src/lighteval/models/endpoints/openai_model.py
+++ b/src/lighteval/models/endpoints/openai_model.py
@@ -147,9 +147,9 @@ def __call_api_parallel(
         num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
         logit_biass = [logit_bias for _ in prompts] if logit_bias is None else logit_bias
 
-        assert (
-            len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass)
-        ), "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
+        assert len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass), (
+            "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
+        )
 
         with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
             for entry in tqdm(
@@ -255,11 +255,11 @@ def _loglikelihood_tokens(
             inputs = [sample.context for sample in split]
             max_new_tokens = [len(sample.tokenized_continuation) for sample in split]
 
-            assert all(
-                new_tokens == 1 for new_tokens in max_new_tokens
-            ), "Only single token continuations are supported when using openai API."
+            assert all(new_tokens == 1 for new_tokens in max_new_tokens), (
+                "Only single token continuations are supported when using openai API."
+            )
 
-            logit_biases = [{tok: 100 for tok in sample.tokenized_continuation} for sample in split]
+            logit_biases = [dict.fromkeys(sample.tokenized_continuation, 100) for sample in split]
 
             outputs = self.__call_api_parallel(
                 inputs, return_logits=True, max_new_tokens=max_new_tokens, num_samples=1, logit_bias=logit_biases
diff --git a/src/lighteval/models/litellm_model.py b/src/lighteval/models/litellm_model.py
index 02455c134..b680d3d84 100644
--- a/src/lighteval/models/litellm_model.py
+++ b/src/lighteval/models/litellm_model.py
@@ -185,7 +185,9 @@ def __call_api_parallel(
         stop_sequencess = [stop_sequence for _ in prompts]
         assert (
             len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess)
-        ), f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
+        ), (
+            f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
+        )
 
         with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
             for entry in tqdm(
diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py
index 76a81f6f8..373a70d09 100644
--- a/src/lighteval/models/nanotron/nanotron_model.py
+++ b/src/lighteval/models/nanotron/nanotron_model.py
@@ -486,9 +486,9 @@ def prepare_batch(
         We truncate to keep only at most `max_context` tokens
         We pad to `padding_length` tokens
         """
-        assert (
-            full_attention_masks is False
-        ), "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
+        assert full_attention_masks is False, (
+            "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
+        )
         assert pad_on_left is False, "pad_on_left=True not supported yet, see TODOs below"
 
         current_pp_rank = dist.get_rank(self.parallel_context.pp_pg)
@@ -505,9 +505,9 @@ def prepare_batch(
         if max_context is None:
             max_context = self.max_length
 
-        assert (
-            self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
-        ), "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
+        assert self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE, (
+            "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
+        )
         # if max_context % self.parallel_config.tp != 0:
         #     # We need to round up to the next multiple of self.parallel_config.tp
         #     if (max_context + (self.parallel_config.tp - max_context % self.parallel_config.tp)) < self.max_length:
@@ -860,9 +860,9 @@ def _loglikelihood_single_token(
             # print(f"i {i} padded: {r.padded}")
 
         if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
-            assert (
-                len(res) == total_length
-            ), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            assert len(res) == total_length, (
+                f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            )
 
         if len(res) == 0:
             # We are in a process which return no output (beginning/middle of the PP group)
@@ -1338,9 +1338,9 @@ def greedy_until(
             res = res[: len(res) - to_remove_at_the_end]
 
         if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
-            assert (
-                len(res) == total_length
-            ), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            assert len(res) == total_length, (
+                f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            )
 
         if len(res) == 0:
             # We are in a process which return no output (beginning/middle of the PP group)
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index e539b926f..5be1285f8 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -80,7 +80,9 @@ class VLLMModelConfig(ModelConfig):
     data_parallel_size: PositiveInt = 1  # how many GPUs to use for data parallelism
     pipeline_parallel_size: PositiveInt = 1  # how many GPUs to use for pipeline parallelism
     gpu_memory_utilization: NonNegativeFloat = 0.9  # lower this if you are running out of memory
-    max_model_length: PositiveInt | None = None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
+    max_model_length: PositiveInt | None = (
+        None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
+    )
     quantization: str | None = None
     load_format: str | None = None
     swap_space: PositiveInt = 4  # CPU swap space size (GiB) per GPU.
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 0b880b452..e9459e0e1 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -383,7 +383,7 @@ def tensor_replacer(match):
         try:
             return ast.literal_eval(processed)
         except Exception as e:
-            raise ValueError(f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}")
+            raise ValueError(f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}")
 
     def _load_responses_from_details(self):
         logger.info("--- LOADING RESPONSES FROM DETAILS ---")
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 8f973ffda..3bdaefc23 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -826,7 +826,7 @@ def ethics_commonsense(line, task_name: str = None):
 def ethics_deontology(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']} {line['excuse']}\"\nAnswer:",
+        query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]} {line["excuse"]}"\nAnswer:',
         choices=[" unreasonable", " reasonable"],
         gold_index=int(line["label"]),
         instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -836,7 +836,7 @@ def ethics_deontology(line, task_name: str = None):
 def ethics_justice(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:",
+        query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:',
         choices=[" unreasonable", " reasonable"],
         gold_index=int(line["label"]),
         instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -859,7 +859,7 @@ def ethics_utilitarianism(line, task_name: str = None):
 def ethics_virtue(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Sentence: {line['scenario']}\nQuestion: Does the character in this sentence exhibit the trait \"{line['trait']}\"?\nAnswer:",
+        query=f'Sentence: {line["scenario"]}\nQuestion: Does the character in this sentence exhibit the trait "{line["trait"]}"?\nAnswer:',
         choices=[" no", " yes"],
         gold_index=int(line["label"]),
     )
@@ -1236,24 +1236,21 @@ def lextreme_covid19_emergency_event(line, task_name: str = None):
 def lextreme_multi_eurlex_level_1(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 1 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 1 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_multi_eurlex_level_2(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 2 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 2 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_multi_eurlex_level_3(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 3 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 3 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
@@ -1261,8 +1258,7 @@ def lextreme_multi_eurlex_level_3(line, task_name: str = None):
 def lextreme_greek_legal_ner(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a sentence from Greek legislation. "
-        "Predict the named entity type for each token."
+        "In this task, you are given a sentence from Greek legislation. Predict the named entity type for each token."
     )
     return lextreme(line, instruction, task_name)
@@ -1313,7 +1309,7 @@ def legal_summarization(line, task_name: str = None):
 def mgsm(line, question_key, answer_key, task_name: str = None):
     if line["answer"] is not None:
         query = f"{line['question']}\n{answer_key}"
-        gold = f" {line['answer'][len(answer_key) + 1:]}"
+        gold = f" {line['answer'][len(answer_key) + 1 :]}"
     else:
         query = f"{question_key} {line['question']}\n{answer_key}"
         gold = f" {str(line['answer_number'])}"
diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py
index 9f6a85610..76a63c1ad 100644
--- a/src/lighteval/tasks/extended/hle/main.py
+++ b/src/lighteval/tasks/extended/hle/main.py
@@ -208,7 +208,7 @@ def hle_text_only(line, task_name: str = None):
 
 hle_metrics = CorpusLevelMetricGrouping(
     metric_name=["accuracy", "confidence_half_width", "calibration_error"],
-    higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
+    higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True),
     category=MetricCategory.LLM_AS_JUDGE,
     use_case=MetricUseCase.ACCURACY,
     sample_level_fn=JudgeLLMHLE().compute,
diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/extended/ifeval/instructions.py
index 7c8591ae2..ee9e7b88b 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions.py
+++ b/src/lighteval/tasks/extended/ifeval/instructions.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 """Library of instructions."""
+
 import collections
 import json
 import logging
@@ -204,7 +205,7 @@ def build_description(self, *, num_sentences=None, relation=None):
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
             raise ValueError(
-                "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given."
+                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
             )
         else:
             self._comparison_relation = relation
@@ -663,7 +664,7 @@ def build_description(self, *, original_message):
           A string representing the instruction description.
         """
         if not self.is_change(original_message):
-            raise ValueError(f"Message {original_message} does not contain changes " "in the form of *change me*.")
+            raise ValueError(f"Message {original_message} does not contain changes in the form of *change me*.")
 
         self._reference_without_change = original_message
         self._description = (
@@ -694,7 +695,7 @@ def check_following(self, value):
 
         """
         if not self.is_change(value):
-            raise ValueError(f"value {value} does not contain " "changes in the form of *change me*.")
+            raise ValueError(f"value {value} does not contain changes in the form of *change me*.")
 
         response_without_changes = self.strip_changes(value)
         reference_without_changes = self.strip_changes(self._reference_without_change)
@@ -782,7 +783,7 @@ def build_description(self, *, keyword=None, frequency=None, relation=None):
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
             raise ValueError(
-                "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given."
+                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
             )
         else:
             self._comparison_relation = relation
@@ -846,7 +847,7 @@ def build_description(self, *, num_words=None, relation=None):
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
             raise ValueError(
-                "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given."
+                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
             )
         else:
             self._comparison_relation = relation
@@ -878,7 +879,7 @@ class JsonFormat(Instruction):
 
     def build_description(self):
         self._description_pattern = (
-            "Entire output should be wrapped in JSON format. You can use markdown" " ticks such as ```."
+            "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."
         )
         return self._description_pattern
 
@@ -1250,7 +1251,7 @@ def build_description(self, *, end_phrase=None):
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
         self._description_pattern = (
-            "Finish your response with this exact phrase {ender}. " "No other words should follow this phrase."
+            "Finish your response with this exact phrase {ender}. No other words should follow this phrase."
         )
         return self._description_pattern.format(ender=self._end_phrase)
 
@@ -1274,7 +1275,7 @@ class TitleChecker(Instruction):
     def build_description(self):
         """Build the instruction description."""
         self._description_pattern = (
-            "Your answer must contain a title, wrapped in double angular brackets," " such as <<poem of joy>>."
+            "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>."
         )
         return self._description_pattern
 
@@ -1337,7 +1338,7 @@ def build_description(self, *, letter=None, let_frequency=None, let_relation=Non
             self._comparison_relation = let_relation
 
         self._description_pattern = (
-            "In your response, the letter {letter} should appear {let_relation}" " {let_frequency} times."
+            "In your response, the letter {letter} should appear {let_relation} {let_frequency} times."
         )
 
         return self._description_pattern.format(
@@ -1402,8 +1403,7 @@ class LowercaseLettersEnglishChecker(Instruction):
     def build_description(self):
         """Build the instruction description."""
         self._description_pattern = (
-            "Your entire response should be in English, and in all lowercase"
-            " letters. No capital letters are allowed."
+            "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed."
         )
         return self._description_pattern
 
@@ -1479,7 +1479,7 @@ def build_description(
         )
 
         self._description_pattern = (
-            "In your response, words with all capital letters should appear" " {relation} {frequency} times."
+            "In your response, words with all capital letters should appear {relation} {frequency} times."
         )
         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
diff --git a/src/lighteval/tasks/extended/ifeval/instructions_registry.py b/src/lighteval/tasks/extended/ifeval/instructions_registry.py
index 611e607dc..62becfbaa 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions_registry.py
+++ b/src/lighteval/tasks/extended/ifeval/instructions_registry.py
@@ -13,6 +13,7 @@
 # limitations under the License.
"""Registry of all instructions.""" + import lighteval.tasks.extended.ifeval.instructions as instructions diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/extended/ifeval/main.py index 60d1be5fa..f460c288a 100644 --- a/src/lighteval/tasks/extended/ifeval/main.py +++ b/src/lighteval/tasks/extended/ifeval/main.py @@ -127,7 +127,7 @@ def agg_inst_level_acc(items): ifeval_metrics = SampleLevelMetricGrouping( metric_name=submetric_names, - higher_is_better={n: True for n in submetric_names}, + higher_is_better=dict.fromkeys(submetric_names, True), category=MetricCategory.GENERATIVE, use_case=MetricUseCase.ACCURACY, sample_level_fn=ifeval_metric, diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index fae6e89df..3e4cfed6f 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -26,6 +26,7 @@ Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0|0,extended|tiny:gsm8k|0|0,extended|tiny:hellaswag|0|0,extended|tiny:arc|0|0,extended|tiny:truthfulqa|0|0" --extended_tasks extended_tasks --output_dir "./evals"` """ + import os import pathlib import pickle @@ -105,10 +106,10 @@ def compute(self, **args): res = ExactMatches( strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer ).compute(**args) - return {m: res for m in self.METRICS} + return dict.fromkeys(self.METRICS, res) else: res = LoglikelihoodAcc().compute(**args) - return {m: res for m in self.METRICS} + return dict.fromkeys(self.METRICS, res) def aggregate(self, y_input): if len(y_input) == self.num_samples and self.estimates is not None: @@ -276,7 +277,7 @@ def aggregate(self, y_input): f"tinybench_metric_{name}", CorpusLevelMetricGrouping( metric_name=TinyCorpusAggregator.METRICS, - higher_is_better={m: True for m in TinyCorpusAggregator.METRICS}, + higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), sample_level_fn=TinyCorpusAggregator(name).compute, category=category, use_case=use_case, diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index d6b203d58..c9a31904b 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -447,9 +447,9 @@ def construct_requests( for i, choice in enumerate(formatted_doc.choices) ] if self.has_metric_category[MetricCategory.MULTICHOICE_PMI]: - assert ( - formatted_doc.unconditioned_query is not None - ), "Unconditioned query is required for PMI normalization" + assert formatted_doc.unconditioned_query is not None, ( + "Unconditioned query is required for PMI normalization" + ) requests[RequestType.LOGLIKELIHOOD] += [ LoglikelihoodRequest( task_name=current_task_name, diff --git a/tests/test_unit_reorder.py b/tests/test_unit_reorder.py index 8726f5ba0..6212bb646 100644 --- a/tests/test_unit_reorder.py +++ b/tests/test_unit_reorder.py @@ -92,13 +92,13 @@ def test_reorder_dataset(self): original_data = dataset.get_original_order(sorted_data) for i in range(len(sorted_data) - 1): - assert ( - len(sorted_data[i].context) >= len(sorted_data[i + 1].context) - ), f"dataset[{i}][0] = {sorted_data[i].context} is shorter than dataset[{i + 1}][0] = {sorted_data[i + 1].context}" + assert len(sorted_data[i].context) >= len(sorted_data[i + 1].context), ( + f"dataset[{i}][0] = {sorted_data[i].context} is shorter than dataset[{i + 1}][0] = {sorted_data[i + 
1].context}" + ) - assert len(sorted_data) == len( - original_data - ), f"reordered dataset has length {len(sorted_data)}, should be {len(dataset)}" + assert len(sorted_data) == len(original_data), ( + f"reordered dataset has length {len(sorted_data)}, should be {len(dataset)}" + ) for sorted_data, orignal in zip(original_data, data): assert sorted_data == orignal