File size: 730 Bytes
d2ed505 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def language_identification(texts):
text = [
texts,
]
model_ckpt = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
preds = torch.softmax(logits, dim=-1)
# Map raw predictions to languages
id2lang = model.config.id2label
vals, idxs = torch.max(preds, dim=1)
lang_dict = {id2lang[k.item()]: v.item() for k, v in zip(idxs, vals)}
return lang_dict |