speech-to-dutch-translation

Build error

File size: 2,694 Bytes

6e8e13a
c13b6ed
 
cb99941
 
6e8e13a
cb99941
6e8e13a
 
 
 
cb99941
 
 
 
6e8e13a
cb99941
 
 
6e8e13a
 
cb99941
 
 
 
 
6e8e13a
 
 
cb99941
 
 
 
 
 
 
 
6e8e13a
cb99941
 
 
 
c13b6ed
cb99941
6e8e13a
 
aa23905
6e8e13a
 
 
 
 
 
 
cb99941
6e8e13a
 
 
 
 
 
 
cb99941
6e8e13a
 
4843ec9
6e8e13a
 
 
 
 
 
 
cb99941

import torch
import numpy as np
import gradio as gr
from transformers import AutoProcessor, SpeechT5ForTextToSpeech, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5HifiGan
from datasets import load_dataset

device = "cpu"

# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# load text-to-speech checkpoint
tts_processor = AutoProcessor.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# load speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def transcribe(audio):
    outputs = asr_pipe(audio, generate_kwargs={"task": "transcribe",
                                               "language":"nl",
                                               "use_cache":True,
                                               "max_new_tokens":128})
    return outputs["text"]

def synthesise(text):
    inputs = tts_processor(text=text,
                           truncation=True,
                           return_tensors="pt")
    speech = tts_model.generate_speech(inputs["input_ids"].to(device),
                                       speaker_embeddings.to(device),
                                       vocoder=vocoder,
                                       )
    return speech.cpu().numpy()

def speech_to_dutch_translation(audio):
    dutch_text = transcribe(audio)
    speech = synthesise(dutch_text)
    speech = (speech * 32767).astype(np.int16)

    return 16_000, speech


title = "Speech-To-Speech-Translation for Hindi"
description = """
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_dutch_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_dutch_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    # examples=["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=False)