import time import librosa import torch from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor from transformers import set_seed def identify_language(fp:str) -> str: ''' For given audio file, identify what language it uses. Parameters ---------- fp: str The file path to the audio file. Returns ---------- detected_lang:str The iso3 code of the detected language. ''' # Ensure replicability set_seed(555) start_time = time.time() # Load language ID model model_id = "facebook/mms-lid-256" # Need to find the appropriate model for the language -- 256 languages is the first that contains MOS processor = AutoFeatureExtractor.from_pretrained(model_id) model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id) # Process the audio signal, sampling_rate = librosa.load(fp, sr=16000) inputs = processor(signal, sampling_rate=16_000, return_tensors="pt") # Inference with torch.no_grad(): outputs = model(**inputs).logits lang_id = torch.argmax(outputs, dim=-1)[0].item() detected_lang = model.config.id2label[lang_id] print("Time elapsed: ", int(time.time() - start_time), " seconds") return detected_lang