Spaces:
Sleeping
Sleeping
File size: 1,876 Bytes
78f7e42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
from ast import literal_eval
def make_lang_list(row):
    """Parse the ``languages`` entry of *row* into a Python object.

    The sentinel string ``"none"`` yields an empty list. Any other value is
    handed to ``ast.literal_eval``, so it must be a string holding a Python
    literal (``literal_eval`` raises on anything else).
    """
    raw = row["languages"]
    return [] if raw == "none" else literal_eval(raw)
def language_count(row):
    """Return the length of the ``languages`` entry of *row*."""
    languages = row["languages"]
    return len(languages)
def _is_language_placeholder(value):
    """Return True when *value* is a "no languages" marker.

    The markers are the literal string ``"False"`` and an empty dict.
    """
    if isinstance(value, str):
        return value == "False"
    return isinstance(value, dict) and not value


def _parse_language_cell(value):
    """Turn one raw ``languages`` cell into a Python object.

    ``"none"`` (the stand-in for a missing entry) becomes an empty list; any
    other value is evaluated with ``literal_eval``.
    """
    return [] if value == "none" else literal_eval(value)


def process_for_lang(data, modality):
    """Filter *data* by modality and derive per-model language columns.

    The caller's DataFrame is never modified: all derived columns are written
    to a private copy, which is what gets returned.

    Args:
        data: DataFrame with a ``modality`` column and a ``languages`` column.
            Each ``languages`` cell is either missing (null, the string
            ``"False"`` or an empty dict) or a string that ``literal_eval``
            can parse (presumably the repr of a list of language tags --
            verify against the data source).
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` restricts the
            rows to that modality; any other value keeps every row.

    Returns:
        A 4-tuple ``(frame, no_lang_count, total_langs, unique_langs)``:
        ``frame`` is the filtered copy with ``languages`` parsed and the
        columns ``language_count`` and ``multilingual`` (0/1) added;
        ``no_lang_count`` is the number of rows without languages;
        ``total_langs`` is the number of distinct languages; ``unique_langs``
        is the array of those distinct languages.
    """
    modality_values = {"NLP": "nlp", "Audio": "audio", "Multimodal": "multimodal"}
    if modality in modality_values:
        data = data[data["modality"] == modality_values[modality]]
    # Copy before writing: without it the caller's frame is mutated when no
    # filter applies, and the writes hit a filtered slice when one does.
    data = data.copy()

    raw_languages = data["languages"]
    # Per-cell check instead of ``Series == {}``, which depends on pandas
    # treating a dict as a scalar. ``astype(bool)`` keeps the mask boolean
    # even when the frame is empty.
    is_placeholder = raw_languages.map(_is_language_placeholder).astype(bool)
    missing = raw_languages.isna() | is_placeholder

    # Count of rows that have no languages
    no_lang_count = missing.sum()

    # The languages column may hold several languages per row, so parse every
    # cell into a list. Column-wise ``map`` (rather than row-wise
    # ``DataFrame.apply``) also behaves on an empty frame.
    data["languages"] = raw_languages.where(~missing, "none").map(_parse_language_cell)
    data["language_count"] = data["languages"].map(len)

    # Just keep the models with at least one language
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    # Drop any empty-dict entries that were nested inside a language list.
    langs = langs[langs.map(lambda lang: lang != {}).astype(bool)]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    data["multilingual"] = data["languages"].map(
        lambda languages: int("multilingual" in languages)
    )
    return data, no_lang_count, total_langs, unique_langs
def filter_multilinguality(data, linguality):
    """Restrict *data* according to the selected multilinguality option.

    Args:
        data: DataFrame with the columns ``multilingual`` (0/1 flag) and
            ``language_count``.
        linguality: ``"Just Multilingual"`` keeps rows flagged multilingual or
            listing more than one language; ``"Three or more languages"``
            keeps rows with a language count of at least 3; any other value
            returns *data* unchanged.

    Returns:
        The filtered DataFrame, or *data* itself when no filter applies.
    """
    if linguality == "Just Multilingual":
        has_multilingual_tag = data["multilingual"] == 1
        has_several_languages = data["language_count"] > 1
        return data[has_multilingual_tag | has_several_languages]
    if linguality == "Three or more languages":
        return data[data["language_count"] >= 3]
    return data