diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0551f9155..b832c5a5c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,7 +34,7 @@ repos:
   - repo: https://github.com/charliermarsh/ruff-pre-commit
     # Ruff version.
-    rev: 'v0.2.2'
+    rev: 'v0.11.10'
     hooks:
       - id: ruff
         args: ['--fix']
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 31b1b8752..55165074a 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import random
 import re
 from typing import Any, Dict, List, Optional, Union
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py
index 2fd85f69b..49010098c 100644
--- a/examples/nanotron/custom_evaluation_tasks.py
+++ b/examples/nanotron/custom_evaluation_tasks.py
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import re
 from dataclasses import asdict
 from typing import Dict, List, Tuple
diff --git a/pyproject.toml b/pyproject.toml
index ebca5bf17..931846b74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,7 +95,7 @@ nanotron = [
 ]
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
-quality = ["ruff==v0.2.2","pre-commit"]
+quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest==7.4.0","deepdiff"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 3bf5e7923..e654adcbc 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -603,7 +603,7 @@ def recreate_metadata_card(self, repo_id: str) -> None:  # noqa: C901
             f"To load the details from a run, you can for instance do the following:\n"
             f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
             f"## Latest results\n\n"
-            f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
+            f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})"
             f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
             f'You find each in the results and the "latest" split for each eval):\n\n'
             f"```python\n{results_string}\n```",
diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 2305332cd..9f8c9b3c4 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -556,7 +556,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
             if len(list_of_subtasks) > 1:
                 metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
                 self.metric_aggregated[average_task] = {
-                    metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
+                    metric: sum(self.metric_aggregated[k][metric] for k in list_of_subtasks) / len(list_of_subtasks)
                     for metric in metrics
                 }
diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py
index 1012bc3f7..80ee47357 100644
--- a/src/lighteval/metrics/imports/bert_scorer.py
+++ b/src/lighteval/metrics/imports/bert_scorer.py
@@ -22,6 +22,7 @@
 # SOFTWARE.
 
 """Simplified version of the BertScorer lib - we only import what we need."""
+
 import logging
 import os
 import time
diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index d383d61f9..3b7e4d537 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -127,7 +127,7 @@ def __init__(
         if self.backend == "inference-providers" and self.hf_provider is None:
             raise ValueError("When using 'inference-providers' as backend, you must specify an 'hf_provider'")
 
-    def __lazy_load_client(self):
+    def __lazy_load_client(self):  # noqa: C901
         match self.backend:
             # Both "openai" and "tgi" backends use the OpenAI-compatible API
             # They are handled separately to allow for backend-specific validation and setup
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 360d55487..efc762dec 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -624,16 +624,16 @@ class Metrics(Enum):
         sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_8_16 = SampleLevelMetricGrouping(
         metric_name="G-Pass@8-16:48_samples",
         sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -653,8 +653,8 @@ class Metrics(Enum):
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -674,8 +674,8 @@ class Metrics(Enum):
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     perfect_exact_match = SampleLevelMetric(
         metric_name="perfect_em",
diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index 7c12e12ef..030725a53 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -24,6 +24,7 @@
 Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
 A number of these aggregations come from the EleutherAIHarness
 """
+
 import logging
 import math
 from typing import Literal
diff --git a/src/lighteval/metrics/stderr.py b/src/lighteval/metrics/stderr.py
index cc9c6febb..e8d47e444 100644
--- a/src/lighteval/metrics/stderr.py
+++ b/src/lighteval/metrics/stderr.py
@@ -42,7 +42,7 @@
 
 def _stddev(arr):
     mu = np.mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+    return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))
 
 
 def mean_stderr(arr):
diff --git a/src/lighteval/metrics/utils/math_comparison.py b/src/lighteval/metrics/utils/math_comparison.py
index e90f53f7b..6592bb8d8 100644
--- a/src/lighteval/metrics/utils/math_comparison.py
+++ b/src/lighteval/metrics/utils/math_comparison.py
@@ -374,7 +374,7 @@ def are_flipped_inequalities_equal(a: Relational, b: Relational) -> bool:
 
     # Same type of relation (e.g. both <= or both >=)
    try:
-        if type(gold) == type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision):  # type: ignore
+        if type(gold) is type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision):  # type: ignore
            return True
    except TimeoutError:
        raise
diff --git a/src/lighteval/models/endpoints/inference_providers_model.py b/src/lighteval/models/endpoints/inference_providers_model.py
index f7dc13943..42a8b50a8 100644
--- a/src/lighteval/models/endpoints/inference_providers_model.py
+++ b/src/lighteval/models/endpoints/inference_providers_model.py
@@ -157,9 +157,9 @@ async def __call_api_parallel(
         results = []
         num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
 
-        assert len(prompts) == len(
-            num_sampless
-        ), f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
+        assert len(prompts) == len(num_sampless), (
+            f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
+        )
 
         async def bounded_api_call(prompt, num_samples):
             async with self.semaphore:
diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py
index 42771b1ae..6a6e20c29 100644
--- a/src/lighteval/models/endpoints/openai_model.py
+++ b/src/lighteval/models/endpoints/openai_model.py
@@ -147,9 +147,9 @@ def __call_api_parallel(
         num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
         logit_biass = [logit_bias for _ in prompts] if logit_bias is None else logit_bias
 
-        assert (
-            len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass)
-        ), "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
+        assert len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass), (
+            "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
+        )
 
         with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
             for entry in tqdm(
@@ -255,11 +255,11 @@ def _loglikelihood_tokens(
             inputs = [sample.context for sample in split]
             max_new_tokens = [len(sample.tokenized_continuation) for sample in split]
 
-            assert all(
-                new_tokens == 1 for new_tokens in max_new_tokens
-            ), "Only single token continuations are supported when using openai API."
+            assert all(new_tokens == 1 for new_tokens in max_new_tokens), (
+                "Only single token continuations are supported when using openai API."
+            )
 
-            logit_biases = [{tok: 100 for tok in sample.tokenized_continuation} for sample in split]
+            logit_biases = [dict.fromkeys(sample.tokenized_continuation, 100) for sample in split]
 
             outputs = self.__call_api_parallel(
                 inputs, return_logits=True, max_new_tokens=max_new_tokens, num_samples=1, logit_bias=logit_biases
diff --git a/src/lighteval/models/litellm_model.py b/src/lighteval/models/litellm_model.py
index 02455c134..b680d3d84 100644
--- a/src/lighteval/models/litellm_model.py
+++ b/src/lighteval/models/litellm_model.py
@@ -185,7 +185,9 @@ def __call_api_parallel(
         stop_sequencess = [stop_sequence for _ in prompts]
         assert (
             len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess)
-        ), f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
+        ), (
+            f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
+        )
 
         with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
             for entry in tqdm(
diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py
index 76a81f6f8..373a70d09 100644
--- a/src/lighteval/models/nanotron/nanotron_model.py
+++ b/src/lighteval/models/nanotron/nanotron_model.py
@@ -486,9 +486,9 @@ def prepare_batch(
         We truncate to keep only at most `max_context` tokens
         We pad to `padding_length` tokens
         """
-        assert (
-            full_attention_masks is False
-        ), "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
+        assert full_attention_masks is False, (
+            "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
+        )
         assert pad_on_left is False, "pad_on_left=True not supported yet, see TODOs below"
 
         current_pp_rank = dist.get_rank(self.parallel_context.pp_pg)
@@ -505,9 +505,9 @@ def prepare_batch(
         if max_context is None:
             max_context = self.max_length
 
-        assert (
-            self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
-        ), "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
+        assert self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE, (
+            "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
+        )
         # if max_context % self.parallel_config.tp != 0:
         #     # We need to round up to the next multiple of self.parallel_config.tp
         #     if (max_context + (self.parallel_config.tp - max_context % self.parallel_config.tp)) < self.max_length:
@@ -860,9 +860,9 @@ def _loglikelihood_single_token(
             # print(f"i {i} padded: {r.padded}")
 
         if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
-            assert (
-                len(res) == total_length
-            ), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            assert len(res) == total_length, (
+                f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            )
 
         if len(res) == 0:
             # We are in a process which return no output (beginning/middle of the PP group)
@@ -1338,9 +1338,9 @@ def greedy_until(
             res = res[: len(res) - to_remove_at_the_end]
 
         if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
-            assert (
-                len(res) == total_length
-            ), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            assert len(res) == total_length, (
+                f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            )
 
         if len(res) == 0:
             # We are in a process which return no output (beginning/middle of the PP group)
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index e539b926f..5be1285f8 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -80,7 +80,9 @@ class VLLMModelConfig(ModelConfig):
     data_parallel_size: PositiveInt = 1  # how many GPUs to use for data parallelism
     pipeline_parallel_size: PositiveInt = 1  # how many GPUs to use for pipeline parallelism
     gpu_memory_utilization: NonNegativeFloat = 0.9  # lower this if you are running out of memory
-    max_model_length: PositiveInt | None = None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
+    max_model_length: PositiveInt | None = (
+        None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
+    )
     quantization: str | None = None
     load_format: str | None = None
     swap_space: PositiveInt = 4  # CPU swap space size (GiB) per GPU.
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 0b880b452..e9459e0e1 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -383,7 +383,7 @@ def tensor_replacer(match):
         try:
             return ast.literal_eval(processed)
         except Exception as e:
-            raise ValueError(f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}")
+            raise ValueError(f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}")
 
     def _load_responses_from_details(self):
         logger.info("--- LOADING RESPONSES FROM DETAILS ---")
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 8f973ffda..3bdaefc23 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -826,7 +826,7 @@ def ethics_commonsense(line, task_name: str = None):
 def ethics_deontology(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']} {line['excuse']}\"\nAnswer:",
+        query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]} {line["excuse"]}"\nAnswer:',
         choices=[" unreasonable", " reasonable"],
         gold_index=int(line["label"]),
         instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -836,7 +836,7 @@ def ethics_deontology(line, task_name: str = None):
 def ethics_justice(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:",
+        query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:',
         choices=[" unreasonable", " reasonable"],
         gold_index=int(line["label"]),
         instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -859,7 +859,7 @@ def ethics_utilitarianism(line, task_name: str = None):
 def ethics_virtue(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Sentence: {line['scenario']}\nQuestion: Does the character in this sentence exhibit the trait \"{line['trait']}\"?\nAnswer:",
+        query=f'Sentence: {line["scenario"]}\nQuestion: Does the character in this sentence exhibit the trait "{line["trait"]}"?\nAnswer:',
         choices=[" no", " yes"],
         gold_index=int(line["label"]),
     )
@@ -1236,24 +1236,21 @@ def lextreme_covid19_emergency_event(line, task_name: str = None):
 def lextreme_multi_eurlex_level_1(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 1 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 1 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_multi_eurlex_level_2(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 2 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 2 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_multi_eurlex_level_3(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 3 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 3 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
@@ -1261,8 +1258,7 @@ def lextreme_multi_eurlex_level_3(line, task_name: str = None):
 def lextreme_greek_legal_ner(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a sentence from Greek legislation. "
-        "Predict the named entity type for each token."
+        "In this task, you are given a sentence from Greek legislation. Predict the named entity type for each token."
     )
     return lextreme(line, instruction, task_name)
@@ -1313,7 +1309,7 @@ def legal_summarization(line, task_name: str = None):
 def mgsm(line, question_key, answer_key, task_name: str = None):
     if line["answer"] is not None:
         query = f"{line['question']}\n{answer_key}"
-        gold = f" {line['answer'][len(answer_key) + 1:]}"
+        gold = f" {line['answer'][len(answer_key) + 1 :]}"
     else:
         query = f"{question_key} {line['question']}\n{answer_key}"
         gold = f" {str(line['answer_number'])}"
diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py
index 9f6a85610..76a63c1ad 100644
--- a/src/lighteval/tasks/extended/hle/main.py
+++ b/src/lighteval/tasks/extended/hle/main.py
@@ -208,7 +208,7 @@ def hle_text_only(line, task_name: str = None):
 
 hle_metrics = CorpusLevelMetricGrouping(
     metric_name=["accuracy", "confidence_half_width", "calibration_error"],
-    higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
+    higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True),
     category=MetricCategory.LLM_AS_JUDGE,
     use_case=MetricUseCase.ACCURACY,
     sample_level_fn=JudgeLLMHLE().compute,
diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/extended/ifeval/instructions.py
index 7c8591ae2..ee9e7b88b 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions.py
+++ b/src/lighteval/tasks/extended/ifeval/instructions.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 """Library of instructions."""
+
 import collections
 import json
 import logging
@@ -204,7 +205,7 @@ def build_description(self, *, num_sentences=None, relation=None):
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
             raise ValueError(
-                "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given."
+                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
             )
         else:
             self._comparison_relation = relation
@@ -663,7 +664,7 @@ def build_description(self, *, original_message):
           A string representing the instruction description.
         """
         if not self.is_change(original_message):
-            raise ValueError(f"Message {original_message} does not contain changes " "in the form of *change me*.")
+            raise ValueError(f"Message {original_message} does not contain changes in the form of *change me*.")
 
         self._reference_without_change = original_message
         self._description = (
@@ -694,7 +695,7 @@ def check_following(self, value):
 
         """
         if not self.is_change(value):
-            raise ValueError(f"value {value} does not contain " "changes in the form of *change me*.")
+            raise ValueError(f"value {value} does not contain changes in the form of *change me*.")
 
         response_without_changes = self.strip_changes(value)
         reference_without_changes = self.strip_changes(self._reference_without_change)
@@ -782,7 +783,7 @@ def build_description(self, *, keyword=None, frequency=None, relation=None):
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
             raise ValueError(
-                "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given."
+                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
             )
         else:
             self._comparison_relation = relation
@@ -846,7 +847,7 @@ def build_description(self, *, num_words=None, relation=None):
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
             raise ValueError(
-                "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given."
+                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
             )
         else:
             self._comparison_relation = relation
@@ -878,7 +879,7 @@ class JsonFormat(Instruction):
 
     def build_description(self):
         self._description_pattern = (
-            "Entire output should be wrapped in JSON format. You can use markdown" " ticks such as ```."
+            "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."
         )
         return self._description_pattern
 
@@ -1250,7 +1251,7 @@ def build_description(self, *, end_phrase=None):
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
         self._description_pattern = (
-            "Finish your response with this exact phrase {ender}. " "No other words should follow this phrase."
+            "Finish your response with this exact phrase {ender}. No other words should follow this phrase."
         )
         return self._description_pattern.format(ender=self._end_phrase)
 
@@ -1274,7 +1275,7 @@ class TitleChecker(Instruction):
     def build_description(self):
         """Build the instruction description."""
         self._description_pattern = (
-            "Your answer must contain a title, wrapped in double angular brackets," " such as <<poem of joy>>."
+            "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>."
         )
         return self._description_pattern
 
@@ -1337,7 +1338,7 @@ def build_description(self, *, letter=None, let_frequency=None, let_relation=Non
             self._comparison_relation = let_relation
 
         self._description_pattern = (
-            "In your response, the letter {letter} should appear {let_relation}" " {let_frequency} times."
+            "In your response, the letter {letter} should appear {let_relation} {let_frequency} times."
         )
 
         return self._description_pattern.format(
@@ -1402,8 +1403,7 @@ class LowercaseLettersEnglishChecker(Instruction):
     def build_description(self):
         """Build the instruction description."""
         self._description_pattern = (
-            "Your entire response should be in English, and in all lowercase"
-            " letters. No capital letters are allowed."
+            "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed."
         )
         return self._description_pattern
 
@@ -1479,7 +1479,7 @@ def build_description(
         )
 
         self._description_pattern = (
-            "In your response, words with all capital letters should appear" " {relation} {frequency} times."
+            "In your response, words with all capital letters should appear {relation} {frequency} times."
         )
         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
diff --git a/src/lighteval/tasks/extended/ifeval/instructions_registry.py b/src/lighteval/tasks/extended/ifeval/instructions_registry.py
index 611e607dc..62becfbaa 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions_registry.py
+++ b/src/lighteval/tasks/extended/ifeval/instructions_registry.py
@@ -13,6 +13,7 @@
 # limitations under the License.
"""Registry of all instructions.""" + import lighteval.tasks.extended.ifeval.instructions as instructions diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/extended/ifeval/main.py index 60d1be5fa..f460c288a 100644 --- a/src/lighteval/tasks/extended/ifeval/main.py +++ b/src/lighteval/tasks/extended/ifeval/main.py @@ -127,7 +127,7 @@ def agg_inst_level_acc(items): ifeval_metrics = SampleLevelMetricGrouping( metric_name=submetric_names, - higher_is_better={n: True for n in submetric_names}, + higher_is_better=dict.fromkeys(submetric_names, True), category=MetricCategory.GENERATIVE, use_case=MetricUseCase.ACCURACY, sample_level_fn=ifeval_metric, diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index fae6e89df..3e4cfed6f 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -26,6 +26,7 @@ Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0|0,extended|tiny:gsm8k|0|0,extended|tiny:hellaswag|0|0,extended|tiny:arc|0|0,extended|tiny:truthfulqa|0|0" --extended_tasks extended_tasks --output_dir "./evals"` """ + import os import pathlib import pickle @@ -105,10 +106,10 @@ def compute(self, **args): res = ExactMatches( strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer ).compute(**args) - return {m: res for m in self.METRICS} + return dict.fromkeys(self.METRICS, res) else: res = LoglikelihoodAcc().compute(**args) - return {m: res for m in self.METRICS} + return dict.fromkeys(self.METRICS, res) def aggregate(self, y_input): if len(y_input) == self.num_samples and self.estimates is not None: @@ -276,7 +277,7 @@ def aggregate(self, y_input): f"tinybench_metric_{name}", CorpusLevelMetricGrouping( metric_name=TinyCorpusAggregator.METRICS, - higher_is_better={m: True for m in TinyCorpusAggregator.METRICS}, + higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), sample_level_fn=TinyCorpusAggregator(name).compute, category=category, use_case=use_case, diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index d6b203d58..c9a31904b 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -447,9 +447,9 @@ def construct_requests( for i, choice in enumerate(formatted_doc.choices) ] if self.has_metric_category[MetricCategory.MULTICHOICE_PMI]: - assert ( - formatted_doc.unconditioned_query is not None - ), "Unconditioned query is required for PMI normalization" + assert formatted_doc.unconditioned_query is not None, ( + "Unconditioned query is required for PMI normalization" + ) requests[RequestType.LOGLIKELIHOOD] += [ LoglikelihoodRequest( task_name=current_task_name, diff --git a/tests/test_unit_reorder.py b/tests/test_unit_reorder.py index 8726f5ba0..6212bb646 100644 --- a/tests/test_unit_reorder.py +++ b/tests/test_unit_reorder.py @@ -92,13 +92,13 @@ def test_reorder_dataset(self): original_data = dataset.get_original_order(sorted_data) for i in range(len(sorted_data) - 1): - assert ( - len(sorted_data[i].context) >= len(sorted_data[i + 1].context) - ), f"dataset[{i}][0] = {sorted_data[i].context} is shorter than dataset[{i + 1}][0] = {sorted_data[i + 1].context}" + assert len(sorted_data[i].context) >= len(sorted_data[i + 1].context), ( + f"dataset[{i}][0] = {sorted_data[i].context} is shorter than dataset[{i + 1}][0] = {sorted_data[i + 
1].context}" + ) - assert len(sorted_data) == len( - original_data - ), f"reordered dataset has length {len(sorted_data)}, should be {len(dataset)}" + assert len(sorted_data) == len(original_data), ( + f"reordered dataset has length {len(sorted_data)}, should be {len(dataset)}" + ) for sorted_data, orignal in zip(original_data, data): assert sorted_data == orignal