diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 0aede953d..360d55487 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -158,6 +158,14 @@ class Metrics(Enum): corpus_level_fn=CorpusLevelTranslationMetric("chrf").compute, higher_is_better=True, ) + chrf_plus = CorpusLevelMetric( + metric_name="chrf++", + sample_level_fn=GenerativePreparator().prepare, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=CorpusLevelTranslationMetric("chrf++").compute, + higher_is_better=True, + ) copyright = SampleLevelMetricGrouping( metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"], sample_level_fn=StringDistance( diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 3c2de418f..7c12e12ef 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -104,6 +104,8 @@ def get_metric(self): return sacrebleu.BLEU(trg_lang=self.lang) elif self.metric_type == "chrf": return sacrebleu.CHRF() + elif self.metric_type == "chrf++": + return sacrebleu.CHRF(word_order=2) elif self.metric_type == "ter": return sacrebleu.TER(asian_support=True if self.lang != "" else False) else: diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index 3d92a71e2..dcf7ad564 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -21,6 +21,7 @@ # SOFTWARE. from functools import partial +from itertools import combinations from langcodes import Language as LangCodeLanguage from langcodes import standardize_tag @@ -30,6 +31,7 @@ multilingual_quasi_exact_match_metric, multilingual_quasi_f1_score_metric, ) +from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -52,13 +54,14 @@ from lighteval.tasks.templates.multichoice import get_mcq_prompt_function from lighteval.tasks.templates.nli import get_nli_prompt_function from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.tasks.templates.translation import get_translation_prompt_function from lighteval.tasks.templates.utils.formulation import ( CFFormulation, HybridFormulation, MCFFormulation, ) from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS -from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro +from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro, manage_duplicate_language_codes TASKS_TABLE = [] @@ -3903,3 +3906,250 @@ *hindi_boolq_tasks, ] ) + +# ------------------------------- Translation Tasks ------------------------------- # +flores_200_languages = [ + # "ace_Arab", + "ace_Latn", + "acm_Arab", + "acq_Arab", + "aeb_Arab", + "afr_Latn", + "ajp_Arab", + "aka_Latn", + "amh_Ethi", + "apc_Arab", + "arb_Arab", + # "arb_Latn", + "ars_Arab", + "ary_Arab", + "arz_Arab", + "asm_Beng", + "ast_Latn", + "awa_Deva", + "ayr_Latn", + "azb_Arab", + "azj_Latn", + "bak_Cyrl", + "bam_Latn", + "ban_Latn", + "bel_Cyrl", + "bem_Latn", + "ben_Beng", + "bho_Deva", + # "bjn_Arab", + "bjn_Latn", + "bod_Tibt", + "bos_Latn", + "bug_Latn", + "bul_Cyrl", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cjk_Latn", + "ckb_Arab", + "crh_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "dik_Latn", + "dyu_Latn", + "dzo_Tibt", + "ell_Grek", + "eng_Latn", + "epo_Latn", + "est_Latn", + "eus_Latn", + "ewe_Latn", + "fao_Latn", + "fij_Latn", + "fin_Latn", + "fon_Latn", + "fra_Latn", + "fur_Latn", + "fuv_Latn", + "gla_Latn", + "gle_Latn", + "glg_Latn", + "grn_Latn", + "guj_Gujr", + "hat_Latn", + "hau_Latn", + "heb_Hebr", + "hin_Deva", + "hne_Deva", + "hrv_Latn", + "hun_Latn", + "hye_Armn", + "ibo_Latn", + "ilo_Latn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jav_Latn", + "jpn_Jpan", + "kab_Latn", + "kac_Latn", + "kam_Latn", + "kan_Knda", + # "kas_Arab", + "kas_Deva", + "kat_Geor", + # "knc_Arab", + "knc_Latn", + "kaz_Cyrl", + "kbp_Latn", + "kea_Latn", + "khm_Khmr", + "kik_Latn", + "kin_Latn", + "kir_Cyrl", + "kmb_Latn", + "kmr_Latn", + "kon_Latn", + "kor_Hang", + "lao_Laoo", + "lij_Latn", + "lim_Latn", + "lin_Latn", + "lit_Latn", + "lmo_Latn", + "ltg_Latn", + "ltz_Latn", + "lua_Latn", + "lug_Latn", + "luo_Latn", + "lus_Latn", + "lvs_Latn", + "mag_Deva", + "mai_Deva", + "mal_Mlym", + "mar_Deva", + # "min_Arab", + "min_Latn", + "mkd_Cyrl", + "plt_Latn", + "mlt_Latn", + "mni_Beng", + "khk_Cyrl", + "mos_Latn", + "mri_Latn", + "mya_Mymr", + "nld_Latn", + "nno_Latn", + "nob_Latn", + "npi_Deva", + "nso_Latn", + "nus_Latn", + "nya_Latn", + "oci_Latn", + "gaz_Latn", + "ory_Orya", + "pag_Latn", + "pan_Guru", + "pap_Latn", + "pes_Arab", + "pol_Latn", + "por_Latn", + "prs_Arab", + "pbt_Arab", + "quy_Latn", + "ron_Latn", + "run_Latn", + "rus_Cyrl", + "sag_Latn", + "san_Deva", + "sat_Olck", + "scn_Latn", + "shn_Mymr", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "smo_Latn", + "sna_Latn", + "snd_Arab", + "som_Latn", + "sot_Latn", + "spa_Latn", + "als_Latn", + "srd_Latn", + "srp_Cyrl", + "ssw_Latn", + "sun_Latn", + "swe_Latn", + "swh_Latn", + "szl_Latn", + "tam_Taml", + "tat_Cyrl", + "tel_Telu", + "tgk_Cyrl", + "tgl_Latn", + "tha_Thai", + "tir_Ethi", + "taq_Latn", + "taq_Tfng", + "tpi_Latn", + "tsn_Latn", + "tso_Latn", + "tuk_Latn", + "tum_Latn", + "tur_Latn", + "twi_Latn", + "tzm_Tfng", + "uig_Arab", + "ukr_Cyrl", + "umb_Latn", + "urd_Arab", + "uzn_Latn", + "vec_Latn", + "vie_Latn", + "war_Latn", + "wol_Latn", + "xho_Latn", + "ydd_Hebr", + "yor_Latn", + "yue_Hant", + "zho_Hans", + # "zho_Hant", + "zsm_Latn", + "zul_Latn", +] + + +def flores_adapter(lang1, lang2): + return lambda line: { + "source_text": line[f"sentence_{lang1}"], + "target_text": line[f"sentence_{lang2}"], + } + + +flores200_tasks = [ + LightevalTaskConfig( + name=f"flores200:{lang1}-{lang2}", + prompt_function=get_translation_prompt_function( + source_language=Language(manage_duplicate_language_codes(lang1.split("_")[0])), + target_language=Language(manage_duplicate_language_codes(lang2.split("_")[0])), + adapter=flores_adapter(lang1, lang2), + formulation=CFFormulation(), + ), + suite=("lighteval",), + hf_repo="facebook/flores", + hf_subset=f"{lang1}-{lang2}", + hf_avail_splits=["dev", "devtest"], + evaluation_splits=["devtest"], + few_shots_split="dev", + few_shots_select=None, + generation_size=300, + metric=[Metrics.chrf_plus, Metrics.bleu, Metrics.bleu_1, Metrics.bleu_4], + stop_sequence=["\n"], + trust_dataset=True, + version=0, + ) + for (lang1, lang2) in combinations(flores_200_languages, 2) +] + +TASKS_TABLE.extend( + [ + *flores200_tasks, + ] +) diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index 0e5c8592d..740475cb7 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -75,7 +75,9 @@ def __getattribute__(self, name: str) -> str: TRANSLATION_LITERALS: dict[Language, TranslationLiterals] = { + Language.ACEHNESE: TranslationLiterals(language=Language.ACEHNESE), Language.AFRIKAANS: TranslationLiterals(language=Language.AFRIKAANS), + Language.AKAN: TranslationLiterals(language=Language.AKAN), Language.ALBANIAN: TranslationLiterals(language=Language.ALBANIAN), Language.AMHARIC: TranslationLiterals(language=Language.AMHARIC), Language.ARABIC: TranslationLiterals( @@ -104,7 +106,13 @@ def __getattribute__(self, name: str) -> str: ), Language.ARMENIAN: TranslationLiterals(language=Language.ARMENIAN), Language.ASSAMESE: TranslationLiterals(language=Language.ASSAMESE), + Language.ASTURIAN: TranslationLiterals(language=Language.ASTURIAN), + Language.AWADHI: TranslationLiterals(language=Language.AWADHI), + Language.AYACUCHO_QUECHUA: TranslationLiterals(language=Language.AYACUCHO_QUECHUA), Language.AZERBAIJANI: TranslationLiterals(language=Language.AZERBAIJANI), + Language.BALINESE: TranslationLiterals(language=Language.BALINESE), + Language.BAMBARA: TranslationLiterals(language=Language.BAMBARA), + Language.BANJAR: TranslationLiterals(language=Language.BANJAR), Language.BASHKIR: TranslationLiterals( language=Language.BASHKIR, question_word="һорау", @@ -174,10 +182,13 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["А", "Б", "В", "Г", "Д", "Е"], ), + Language.BEMBA: TranslationLiterals(language=Language.BEMBA), Language.BENGALI: TranslationLiterals(language=Language.BENGALI, question_word="প্রশ্ন"), + Language.BHOJPURI: TranslationLiterals(language=Language.BHOJPURI), Language.BIHARI: TranslationLiterals(language=Language.BIHARI), # Deprecated Language.BOSNIAN: TranslationLiterals(language=Language.BOSNIAN), Language.BRETON: TranslationLiterals(language=Language.BRETON), + Language.BUGINESE: TranslationLiterals(language=Language.BUGINESE), Language.BULGARIAN: TranslationLiterals(language=Language.BULGARIAN), Language.BURMESE: TranslationLiterals(language=Language.BURMESE), Language.CATALAN: TranslationLiterals( @@ -204,6 +215,11 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO), + Language.CENTRAL_ATLAS_TAMAZIGHT: TranslationLiterals(language=Language.CENTRAL_ATLAS_TAMAZIGHT), + Language.CENTRAL_AYMARA: TranslationLiterals(language=Language.CENTRAL_AYMARA), + Language.CENTRAL_KANURI: TranslationLiterals(language=Language.CENTRAL_KANURI), + Language.CENTRAL_KURDISH: TranslationLiterals(language=Language.CENTRAL_KURDISH), + Language.CHHATTISGARHI: TranslationLiterals(language=Language.CHHATTISGARHI), Language.CHINESE: TranslationLiterals( language=Language.CHINESE, question_word="问题", @@ -228,6 +244,8 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"], ), + Language.CHOKWE: TranslationLiterals(language=Language.CHOKWE), + Language.CRIMEAN_TATAR: TranslationLiterals(language=Language.CRIMEAN_TATAR), Language.CROATIAN: TranslationLiterals( language=Language.CROATIAN, question_word="pitanje", @@ -275,6 +293,7 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.DANISH: TranslationLiterals(language=Language.DANISH), + Language.DARI: TranslationLiterals(language=Language.DARI), Language.DIVEHI: TranslationLiterals(language=Language.DIVEHI), Language.DUTCH: TranslationLiterals( language=Language.DUTCH, @@ -299,6 +318,11 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.DYULA: TranslationLiterals(language=Language.DYULA), + Language.DZONGKHA: TranslationLiterals(language=Language.DZONGKHA), + Language.EASTERN_PANJABI: TranslationLiterals(language=Language.EASTERN_PANJABI), + Language.EASTERN_YIDDISH: TranslationLiterals(language=Language.EASTERN_YIDDISH), + Language.EGYPTIAN_ARABIC: TranslationLiterals(language=Language.EGYPTIAN_ARABIC), Language.ENGLISH: TranslationLiterals( language=Language.ENGLISH, question_word="question", @@ -329,6 +353,9 @@ def __getattribute__(self, name: str) -> str: cause_word="sest", effect_word="seetõttu", ), + Language.EWE: TranslationLiterals(language=Language.EWE), + Language.FAROESE: TranslationLiterals(language=Language.FAROESE), + Language.FIJIAN: TranslationLiterals(language=Language.FIJIAN), Language.FINNISH: TranslationLiterals( language=Language.FINNISH, question_word="kysymys", @@ -352,6 +379,7 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.FON: TranslationLiterals(language=Language.FON), Language.FRENCH: TranslationLiterals( language=Language.FRENCH, question_word="question", @@ -375,6 +403,7 @@ def __getattribute__(self, name: str) -> str: sentence_space=" ", colon=":", ), + Language.FRIULIAN: TranslationLiterals(language=Language.FRIULIAN), Language.GALICIAN: TranslationLiterals( language=Language.GALICIAN, question_word="pregunta", @@ -398,6 +427,7 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.GANDA: TranslationLiterals(language=Language.GANDA), Language.GEORGIAN: TranslationLiterals(language=Language.GEORGIAN), Language.GERMAN: TranslationLiterals( language=Language.GERMAN, @@ -445,6 +475,7 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon="·", ), + Language.GUARANI: TranslationLiterals(language=Language.GUARANI), Language.GUJARATI: TranslationLiterals(language=Language.GUJARATI), Language.HAITIAN: TranslationLiterals( # From https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py @@ -452,6 +483,9 @@ def __getattribute__(self, name: str) -> str: cause_word="poukisa", effect_word="donk sa", ), + Language.HAITIAN_CREOLE: TranslationLiterals(language=Language.HAITIAN_CREOLE), + Language.HALH_MONGOLIAN: TranslationLiterals(language=Language.HALH_MONGOLIAN), + Language.HAUSA: TranslationLiterals(language=Language.HAUSA), Language.HEBREW: TranslationLiterals(language=Language.HEBREW), Language.HINDI: TranslationLiterals( language=Language.HINDI, @@ -501,6 +535,8 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.ICELANDIC: TranslationLiterals(language=Language.ICELANDIC), + Language.IGBO: TranslationLiterals(language=Language.IGBO), + Language.ILOCANO: TranslationLiterals(language=Language.ILOCANO), Language.INDONESIAN: TranslationLiterals( language=Language.INDONESIAN, question_word="pertanyaan", @@ -572,9 +608,19 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.JAVANESE: TranslationLiterals(language=Language.JAVANESE), + Language.JINGPHO: TranslationLiterals(language=Language.JINGPHO), + Language.KABIYE: TranslationLiterals(language=Language.KABIYE), + Language.KABUVERDIANU: TranslationLiterals(language=Language.KABUVERDIANU), + Language.KABYLE: TranslationLiterals(language=Language.KABYLE), + Language.KAMBA: TranslationLiterals(language=Language.KAMBA), Language.KANNADA: TranslationLiterals(language=Language.KANNADA), + Language.KASHMIRI: TranslationLiterals(language=Language.KASHMIRI), Language.KAZAKH: TranslationLiterals(language=Language.KAZAKH), Language.KHMER: TranslationLiterals(language=Language.KHMER), + Language.KIKONGO: TranslationLiterals(language=Language.KIKONGO), + Language.KIKUYU: TranslationLiterals(language=Language.KIKUYU), + Language.KIMBUNDU: TranslationLiterals(language=Language.KIMBUNDU), + Language.KINYARWANDA: TranslationLiterals(language=Language.KINYARWANDA), Language.KIRGHIZ: TranslationLiterals(language=Language.KIRGHIZ), Language.KOREAN: TranslationLiterals( language=Language.KOREAN, @@ -583,18 +629,43 @@ def __getattribute__(self, name: str) -> str: no="아니오", ), Language.KURDISH: TranslationLiterals(language=Language.KURDISH), + Language.KYRGYZ: TranslationLiterals(language=Language.KYRGYZ), Language.LAO: TranslationLiterals(language=Language.LAO), + Language.LATGALIAN: TranslationLiterals(language=Language.LATGALIAN), Language.LATIN: TranslationLiterals(language=Language.LATIN), Language.LATVIAN: TranslationLiterals(language=Language.LATVIAN), + Language.LIGURIAN: TranslationLiterals(language=Language.LIGURIAN), + Language.LIMBURGISH: TranslationLiterals(language=Language.LIMBURGISH), + Language.LINGALA: TranslationLiterals(language=Language.LINGALA), Language.LITHUANIAN: TranslationLiterals(language=Language.LITHUANIAN), + Language.LOMBARD: TranslationLiterals(language=Language.LOMBARD), + Language.LUBA_KASAI: TranslationLiterals(language=Language.LUBA_KASAI), + Language.LUO: TranslationLiterals(language=Language.LUO), Language.LUXEMBOURGISH: TranslationLiterals(language=Language.LUXEMBOURGISH), Language.MACEDONIAN: TranslationLiterals(language=Language.MACEDONIAN), + Language.MAGAHI: TranslationLiterals(language=Language.MAGAHI), + Language.MAITHILI: TranslationLiterals(language=Language.MAITHILI), Language.MALAGASY: TranslationLiterals(language=Language.MALAGASY), Language.MALAY: TranslationLiterals(language=Language.MALAY), Language.MALAYALAM: TranslationLiterals(language=Language.MALAYALAM), Language.MALTESE: TranslationLiterals(language=Language.MALTESE), + Language.MAORI: TranslationLiterals(language=Language.MAORI), Language.MARATHI: TranslationLiterals(language=Language.MARATHI), + Language.MEITEI: TranslationLiterals(language=Language.MEITEI), + Language.MESOPOTAMIAN_ARABIC: TranslationLiterals(language=Language.MESOPOTAMIAN_ARABIC), + Language.MINANGKABAU: TranslationLiterals(language=Language.MINANGKABAU), + Language.MIZO: TranslationLiterals(language=Language.MIZO), + Language.MODERN_STANDARD_ARABIC: TranslationLiterals(language=Language.MODERN_STANDARD_ARABIC), + Language.MOROCCAN_ARABIC: TranslationLiterals(language=Language.MOROCCAN_ARABIC), + Language.MOSSI: TranslationLiterals(language=Language.MOSSI), + Language.NAJDI_ARABIC: TranslationLiterals(language=Language.NAJDI_ARABIC), Language.NEPALI: TranslationLiterals(language=Language.NEPALI), + Language.NIGERIAN_FULFULDE: TranslationLiterals(language=Language.NIGERIAN_FULFULDE), + Language.NORTHERN_KURDISH: TranslationLiterals(language=Language.NORTHERN_KURDISH), + Language.NORTHERN_SOTHO: TranslationLiterals(language=Language.NORTHERN_SOTHO), + Language.NORTHERN_UZBEK: TranslationLiterals(language=Language.NORTHERN_UZBEK), + Language.NORTH_AZERBAIJANI: TranslationLiterals(language=Language.NORTH_AZERBAIJANI), + Language.NORTH_LEVANTINE_ARABIC: TranslationLiterals(language=Language.NORTH_LEVANTINE_ARABIC), Language.NORWEGIAN: TranslationLiterals( language=Language.NORWEGIAN, question_word="spørsmål", @@ -618,11 +689,18 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.NORWEGIAN_BOKMAL: TranslationLiterals(language=Language.NORWEGIAN_BOKMAL), Language.NORWEGIAN_NYNORSK: TranslationLiterals(language=Language.NORWEGIAN_NYNORSK), + Language.NUER: TranslationLiterals(language=Language.NUER), + Language.NYANJA: TranslationLiterals(language=Language.NYANJA), Language.OCCITAN: TranslationLiterals(language=Language.OCCITAN), + Language.ODIA: TranslationLiterals(language=Language.ODIA), Language.ORIYA: TranslationLiterals(language=Language.ORIYA), + Language.PANGASINAN: TranslationLiterals(language=Language.PANGASINAN), + Language.PAPIAMENTO: TranslationLiterals(language=Language.PAPIAMENTO), Language.PASHTO: TranslationLiterals(language=Language.PASHTO), Language.PERSIAN: TranslationLiterals(language=Language.PERSIAN), + Language.PLATEAU_MALAGASY: TranslationLiterals(language=Language.PLATEAU_MALAGASY), Language.POLISH: TranslationLiterals( language=Language.POLISH, question_word="pytanie", @@ -677,6 +755,7 @@ def __getattribute__(self, name: str) -> str: effect_word="chaymi", ), Language.ROMANIAN: TranslationLiterals(language=Language.ROMANIAN), + Language.RUNDI: TranslationLiterals(language=Language.RUNDI), Language.RUSSIAN: TranslationLiterals( language=Language.RUSSIAN, question_word="вопрос", @@ -700,7 +779,12 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["А", "Б", "В", "Г", "Д", "Е"], ), + Language.SAMOAN: TranslationLiterals(language=Language.SAMOAN), + Language.SANGO: TranslationLiterals(language=Language.SANGO), Language.SANSKRIT: TranslationLiterals(language=Language.SANSKRIT), + Language.SANTALI: TranslationLiterals(language=Language.SANTALI), + Language.SARDINIAN: TranslationLiterals(language=Language.SARDINIAN), + Language.SCOTTISH_GAELIC: TranslationLiterals(language=Language.SCOTTISH_GAELIC), # Latin serbian script for future when separating scipts # Language.SERBIAN_LATIN: TranslationLiterals(language=Language.SERBIAN_LATIN, # question_word="pitanje", @@ -763,6 +847,9 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["ၵ", "ၶ", "င", "ၸ", "သ", "ၺ"], ), + Language.SHONA: TranslationLiterals(language=Language.SHONA), + Language.SICILIAN: TranslationLiterals(language=Language.SICILIAN), + Language.SILESIAN: TranslationLiterals(language=Language.SILESIAN), Language.SINDHI: TranslationLiterals(language=Language.SINDHI), Language.SINHALA: TranslationLiterals(language=Language.SINHALA), Language.SLOVAK: TranslationLiterals( @@ -788,9 +875,14 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.SLOVENIAN: TranslationLiterals(language=Language.SLOVENIAN), Language.SOMALI: TranslationLiterals(language=Language.SOMALI), Language.SORANI: TranslationLiterals(language=Language.SORANI), + Language.SOUTHERN_PASHTO: TranslationLiterals(language=Language.SOUTHERN_PASHTO), + Language.SOUTHERN_SOTHO: TranslationLiterals(language=Language.SOUTHERN_SOTHO), + Language.SOUTHWESTERN_DINKA: TranslationLiterals(language=Language.SOUTHWESTERN_DINKA), Language.SOUTH_AZERBAIJANI: TranslationLiterals(language=Language.SOUTH_AZERBAIJANI), + Language.SOUTH_LEVANTINE_ARABIC: TranslationLiterals(language=Language.SOUTH_LEVANTINE_ARABIC), Language.SPANISH: TranslationLiterals( language=Language.SPANISH, question_word="pregunta", @@ -814,6 +906,10 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.STANDARD_LATVIAN: TranslationLiterals(language=Language.STANDARD_LATVIAN), + Language.STANDARD_MALAY: TranslationLiterals(language=Language.STANDARD_MALAY), + Language.STANDARD_TIBETAN: TranslationLiterals(language=Language.STANDARD_TIBETAN), + Language.SUNDANESE: TranslationLiterals(language=Language.SUNDANESE), Language.SWAHILI: TranslationLiterals( language=Language.SWAHILI, question_word="swali", @@ -836,6 +932,7 @@ def __getattribute__(self, name: str) -> str: sentence_space=" ", colon=":", ), + Language.SWATI: TranslationLiterals(language=Language.SWATI), Language.SWEDISH: TranslationLiterals( language=Language.SWEDISH, question_word="fråga", @@ -861,6 +958,7 @@ def __getattribute__(self, name: str) -> str: ), Language.TAGALOG: TranslationLiterals(language=Language.TAGALOG), Language.TAJIK: TranslationLiterals(language=Language.TAJIK), + Language.TAMASHEQ: TranslationLiterals(language=Language.TAMASHEQ), Language.TAMIL: TranslationLiterals( # From https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py language=Language.TAMIL, @@ -890,6 +988,7 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["А", "Б", "В", "Г", "Д", "Е"], ), + Language.TAIZZI_ADENI_ARABIC: TranslationLiterals(language=Language.TAIZZI_ADENI_ARABIC), Language.TELUGU: TranslationLiterals( language=Language.TELUGU, question_word="ప్రశ్న", @@ -936,6 +1035,13 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["๑", "๒", "๓", "๔", "๕", "๖", "๗", "๘", "๙", "๐"], ), + Language.TIGRINYA: TranslationLiterals(language=Language.TIGRINYA), + Language.TOK_PISIN: TranslationLiterals(language=Language.TOK_PISIN), + Language.TOSK_ALBANIAN: TranslationLiterals(language=Language.TOSK_ALBANIAN), + Language.TSONGA: TranslationLiterals(language=Language.TSONGA), + Language.TSWANA: TranslationLiterals(language=Language.TSWANA), + Language.TUMBUKA: TranslationLiterals(language=Language.TUMBUKA), + Language.TUNISIAN_ARABIC: TranslationLiterals(language=Language.TUNISIAN_ARABIC), Language.TURKISH: TranslationLiterals( language=Language.TURKISH, question_word="soru", @@ -960,6 +1066,7 @@ def __getattribute__(self, name: str) -> str: colon=":", ), Language.TURKMEN: TranslationLiterals(language=Language.TURKMEN), + Language.TWI: TranslationLiterals(language=Language.TWI), Language.UDMURT: TranslationLiterals( language=Language.UDMURT, question_word="юан", @@ -1007,6 +1114,7 @@ def __getattribute__(self, name: str) -> str: semicolon=";", indices=["А", "Б", "В", "Г", "Д", "Е"], ), + Language.UMBUNDU: TranslationLiterals(language=Language.UMBUNDU), Language.URDU: TranslationLiterals( language=Language.URDU, question_word="سوال", @@ -1030,6 +1138,7 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon="؛", ), + Language.UYGHUR: TranslationLiterals(language=Language.UYGHUR), Language.UZBEK: TranslationLiterals( language=Language.UZBEK, question_word="savol", @@ -1053,6 +1162,7 @@ def __getattribute__(self, name: str) -> str: sentence_space=" ", colon=":", ), + Language.VENETIAN: TranslationLiterals(language=Language.VENETIAN), Language.VIETNAMESE: TranslationLiterals( language=Language.VIETNAMESE, question_word="câu hỏi", @@ -1077,9 +1187,15 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.WAR: TranslationLiterals(language=Language.WAR), + Language.WARAY: TranslationLiterals(language=Language.WARAY), Language.WELSH: TranslationLiterals(language=Language.WELSH), Language.WESTERN_FRISIAN: TranslationLiterals(language=Language.WESTERN_FRISIAN), + Language.WESTERN_PERSIAN: TranslationLiterals(language=Language.WESTERN_PERSIAN), + Language.WEST_CENTRAL_OROMO: TranslationLiterals(language=Language.WEST_CENTRAL_OROMO), + Language.WOLOF: TranslationLiterals(language=Language.WOLOF), + Language.XHOSA: TranslationLiterals(language=Language.XHOSA), Language.YIDDISH: TranslationLiterals(language=Language.YIDDISH), Language.YORUBA: TranslationLiterals(language=Language.YORUBA), + Language.YUE_CHINESE: TranslationLiterals(language=Language.YUE_CHINESE), Language.ZULU: TranslationLiterals(language=Language.ZULU), } diff --git a/src/lighteval/utils/language.py b/src/lighteval/utils/language.py index d59908b01..f2f98ad37 100644 --- a/src/lighteval/utils/language.py +++ b/src/lighteval/utils/language.py @@ -16,112 +16,227 @@ class Language(Enum): - ENGLISH = "eng" - SPANISH = "spa" - PORTUGUESE = "por" - ITALIAN = "ita" - FRENCH = "fra" - ROMANIAN = "ron" - GERMAN = "deu" - LATIN = "lat" - CZECH = "ces" - DANISH = "dan" - FINNISH = "fin" - GREEK = "ell" - NORWEGIAN = "nor" - POLISH = "pol" - RUSSIAN = "rus" - SLOVENIAN = "slv" - SWEDISH = "swe" - TURKISH = "tur" - DUTCH = "nld" - CHINESE = "zho" - JAPANESE = "jpn" - VIETNAMESE = "vie" - INDONESIAN = "ind" - PERSIAN = "fas" - KOREAN = "kor" + ACEHNESE = "ace" + AFRIKAANS = "afr" + AKAN = "aka" + ALBANIAN = "sqi" + AMHARIC = "amh" ARABIC = "ara" - THAI = "tha" - HINDI = "hin" + ARMENIAN = "hye" + ASSAMESE = "asm" + ASTURIAN = "ast" + AWADHI = "awa" + AYACUCHO_QUECHUA = "quy" + AZERBAIJANI = "aze" + BALINESE = "ban" + BAMBARA = "bam" + BANJAR = "bjn" + BASHKIR = "bak" + BASQUE = "eus" + BELARUSIAN = "bel" + BEMBA = "bem" BENGALI = "ben" - TAMIL = "tam" - HUNGARIAN = "hun" - UKRAINIAN = "ukr" - SLOVAK = "slk" + BHOJPURI = "bho" + BIHARI = "bih" # Deprecated + BOSNIAN = "bos" + BRETON = "bre" + BUGINESE = "bug" BULGARIAN = "bul" + BURMESE = "mya" CATALAN = "cat" + CEBUANO = "ceb" + CENTRAL_ATLAS_TAMAZIGHT = "tzm" + CENTRAL_AYMARA = "ayr" + CENTRAL_KANURI = "knc" + CENTRAL_KURDISH = "ckb" + CHHATTISGARHI = "hne" + CHINESE = "zho" + CHOKWE = "cjk" + CRIMEAN_TATAR = "crh" CROATIAN = "hrv" - SERBIAN = "srp" - LITHUANIAN = "lit" + CZECH = "ces" + DANISH = "dan" + DARI = "prs" + DIVEHI = "div" + DUTCH = "nld" + DYULA = "dyu" + DZONGKHA = "dzo" + EASTERN_PANJABI = "pan" + EASTERN_YIDDISH = "ydd" + EGYPTIAN_ARABIC = "arz" + ENGLISH = "eng" + ESPERANTO = "epo" ESTONIAN = "est" - HEBREW = "heb" - LATVIAN = "lav" - SERBOCROATIAN = "hbs" # Deprecated - ALBANIAN = "sqi" - AZERBAIJANI = "aze" - ICELANDIC = "isl" - MACEDONIAN = "mkd" - GEORGIAN = "kat" + EWE = "ewe" + FAROESE = "fao" + FIJIAN = "fij" + FINNISH = "fin" + FON = "fon" + FRENCH = "fra" + FRIULIAN = "fur" GALICIAN = "glg" - ARMENIAN = "hye" - BASQUE = "eus" - SWAHILI = "swa" - MALAY = "msa" - TAGALOG = "tgl" - JAVANESE = "jav" - PUNJABI = "pan" - BIHARI = "bih" # Deprecated + GANDA = "lug" + GEORGIAN = "kat" + GERMAN = "deu" + GREEK = "ell" + GUARANI = "grn" GUJARATI = "guj" - YORUBA = "yor" - MARATHI = "mar" - URDU = "urd" - AMHARIC = "amh" - TELUGU = "tel" HAITIAN = "hti" - MALAYALAM = "mal" + HAITIAN_CREOLE = "hat" + HALH_MONGOLIAN = "khk" + HAUSA = "hau" + HEBREW = "heb" + HINDI = "hin" + HUNGARIAN = "hun" + ICELANDIC = "isl" + IGBO = "ibo" + ILOCANO = "ilo" + INDONESIAN = "ind" + IRISH = "gle" + ITALIAN = "ita" + JAPANESE = "jpn" + JAVANESE = "jav" + JINGPHO = "kac" + KABIYE = "kbp" + KABUVERDIANU = "kea" + KABYLE = "kab" + KAMBA = "kam" KANNADA = "kan" - NEPALI = "nep" + KASHMIRI = "kas" KAZAKH = "kaz" - BELARUSIAN = "bel" - BURMESE = "mya" - ESPERANTO = "epo" - UZBEK = "uzb" KHMER = "khm" - TAJIK = "tgk" - WELSH = "cym" - NORWEGIAN_NYNORSK = "nno" - BOSNIAN = "bos" - SINHALA = "sin" - TATAR = "tat" - AFRIKAANS = "afr" - ORIYA = "ori" + KIKONGO = "kon" + KIKUYU = "kik" + KIMBUNDU = "kmb" + KINYARWANDA = "kin" KIRGHIZ = "kir" - IRISH = "gle" - OCCITAN = "oci" + KOREAN = "kor" KURDISH = "kur" + KYRGYZ = "kir" LAO = "lao" + LATGALIAN = "ltg" + LATIN = "lat" + LATVIAN = "lav" + LIGURIAN = "lij" + LIMBURGISH = "lim" + LINGALA = "lin" + LITHUANIAN = "lit" + LOMBARD = "lmo" + LUBA_KASAI = "lua" + LUO = "luo" LUXEMBOURGISH = "ltz" - BASHKIR = "bak" - WESTERN_FRISIAN = "fry" - PASHTO = "pus" - MALTESE = "mlt" - BRETON = "bre" - ASSAMESE = "asm" + MACEDONIAN = "mkd" + MAGAHI = "mag" + MAITHILI = "mai" MALAGASY = "mlg" - DIVEHI = "div" - YIDDISH = "yid" - SOMALI = "som" + MALAY = "msa" + MALAYALAM = "mal" + MALTESE = "mlt" + MAORI = "mri" + MARATHI = "mar" + MEITEI = "mni" + MESOPOTAMIAN_ARABIC = "acm" + MINANGKABAU = "min" + MIZO = "lus" + MODERN_STANDARD_ARABIC = "arb" + MOROCCAN_ARABIC = "ary" + MOSSI = "mos" + NAJDI_ARABIC = "ars" + NEPALI = "nep" + NIGERIAN_FULFULDE = "fuv" + NORTHERN_KURDISH = "kmr" + NORTHERN_SOTHO = "nso" + NORTHERN_UZBEK = "uzn" + NORTH_AZERBAIJANI = "azj" + NORTH_LEVANTINE_ARABIC = "apc" + NORWEGIAN = "nor" + NORWEGIAN_BOKMAL = "nob" + NORWEGIAN_NYNORSK = "nno" + NUER = "nus" + NYANJA = "nya" + OCCITAN = "oci" + ODIA = "ory" + ORIYA = "ori" + PANGASINAN = "pag" + PAPIAMENTO = "pap" + PASHTO = "pus" + PERSIAN = "fas" + PLATEAU_MALAGASY = "plt" + POLISH = "pol" + PORTUGUESE = "por" + PUNJABI = "pan" + QUECHUA = "que" + ROMANIAN = "ron" + RUNDI = "run" + RUSSIAN = "rus" + SAMOAN = "smo" + SANGO = "sag" SANSKRIT = "san" + SANTALI = "sat" + SARDINIAN = "srd" + SCOTTISH_GAELIC = "gla" + SERBIAN = "srp" + SERBOCROATIAN = "hbs" # Deprecated + SHAN = "shn" + SHONA = "sna" + SICILIAN = "scn" + SILESIAN = "szl" SINDHI = "snd" - QUECHUA = "que" - TURKMEN = "tuk" - SOUTH_AZERBAIJANI = "azb" + SINHALA = "sin" + SLOVAK = "slk" + SLOVENIAN = "slv" + SOMALI = "som" SORANI = "ckb" - CEBUANO = "ceb" - WAR = "war" - SHAN = "shn" + SOUTHERN_PASHTO = "pbt" + SOUTHERN_SOTHO = "sot" + SOUTHWESTERN_DINKA = "dik" + SOUTH_AZERBAIJANI = "azb" + SOUTH_LEVANTINE_ARABIC = "ajp" + SPANISH = "spa" + STANDARD_LATVIAN = "lvs" + STANDARD_MALAY = "zsm" + STANDARD_TIBETAN = "bod" + SUNDANESE = "sun" + SWAHILI = "swa" + SWATI = "ssw" + SWEDISH = "swe" + TAGALOG = "tgl" + TAJIK = "tgk" + TAMASHEQ = "taq" + TAMIL = "tam" + TATAR = "tat" + TAIZZI_ADENI_ARABIC = "acq" + TELUGU = "tel" + THAI = "tha" + TIGRINYA = "tir" + TOK_PISIN = "tpi" + TOSK_ALBANIAN = "als" + TSONGA = "tso" + TSWANA = "tsn" + TUMBUKA = "tum" + TUNISIAN_ARABIC = "aeb" + TURKISH = "tur" + TURKMEN = "tuk" + TWI = "twi" UDMURT = "udm" + UKRAINIAN = "ukr" + UMBUNDU = "umb" + URDU = "urd" + UYGHUR = "uig" + UZBEK = "uzb" + VENETIAN = "vec" + VIETNAMESE = "vie" + WAR = "war" + WARAY = "war" + WELSH = "cym" + WESTERN_FRISIAN = "fry" + WESTERN_PERSIAN = "pes" + WEST_CENTRAL_OROMO = "gaz" + WOLOF = "wol" + XHOSA = "xho" + YIDDISH = "yid" + YORUBA = "yor" + YUE_CHINESE = "yue" ZULU = "zul" @@ -140,6 +255,15 @@ class Language(Enum): # output[lang_old.pt3] = lang.pt3 # ``` + +def manage_duplicate_language_codes(langcode): + if langcode == "npi": # Nepali + langcode = "nep" + if langcode == "swh": # Swahili + langcode = "swa" + return langcode + + iso_639_3_ind_to_iso_639_3_macro = { "acm": Language.ARABIC, "arz": Language.ARABIC,