File size: 2,694 Bytes
6e8e13a
c13b6ed
 
cb99941
 
6e8e13a
cb99941
6e8e13a
 
 
 
cb99941
 
 
 
6e8e13a
cb99941
 
 
6e8e13a
 
cb99941
 
 
 
 
6e8e13a
 
 
cb99941
 
 
 
 
 
 
 
6e8e13a
cb99941
 
 
 
c13b6ed
cb99941
6e8e13a
 
aa23905
6e8e13a
 
 
 
 
 
 
cb99941
6e8e13a
 
 
 
 
 
 
cb99941
6e8e13a
 
4843ec9
6e8e13a
 
 
 
 
 
 
cb99941
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
import numpy as np
import gradio as gr
from transformers import AutoProcessor, SpeechT5ForTextToSpeech, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5HifiGan
from datasets import load_dataset

# Every model and tensor below is placed on this device.
device = "cpu"

# Speech front end: a Whisper checkpoint wrapped in an ASR pipeline.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

# Text-to-speech back end: SpeechT5 processor + acoustic model, and the
# HiFi-GAN vocoder that is handed to `generate_speech` in `synthesise`.
tts_processor = AutoProcessor.from_pretrained(
    "susnato/speecht5_finetuned_voxpopuli_nl"
)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(
    "susnato/speecht5_finetuned_voxpopuli_nl"
)
tts_model = tts_model.to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Speaker identity: a single x-vector (entry 7306 of the validation split),
# given a leading dimension so it can be passed alongside the input ids.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)


def transcribe(audio):
    """Run the ASR pipeline on ``audio`` and return the recognised text.

    ``audio`` is forwarded unchanged to ``asr_pipe``. Generation is
    requested with task ``"transcribe"``, language ``"nl"``, caching
    enabled and a cap of 128 newly generated tokens.

    Returns the ``"text"`` entry of the pipeline output.
    """
    generation_options = {
        "task": "transcribe",
        "language": "nl",
        "use_cache": True,
        "max_new_tokens": 128,
    }
    result = asr_pipe(audio, generate_kwargs=generation_options)
    return result["text"]

def synthesise(text):
    """Turn ``text`` into a speech waveform using the loaded TTS model.

    The text is tokenised (with truncation enabled) by ``tts_processor``,
    then ``tts_model.generate_speech`` is called with the module-level
    speaker embedding and vocoder. The result is moved to the CPU and
    returned as a NumPy array.
    """
    encoded = tts_processor(text=text, truncation=True, return_tensors="pt")
    token_ids = encoded["input_ids"].to(device)
    voice = speaker_embeddings.to(device)
    waveform = tts_model.generate_speech(token_ids, voice, vocoder=vocoder)
    return waveform.cpu().numpy()

def speech_to_dutch_translation(audio):
    """Convert input speech into synthesised Dutch speech.

    ``audio`` is passed to ``transcribe`` and the resulting text to
    ``synthesise``. The waveform is scaled by 32767 and cast to ``int16``.

    Returns a ``(sample_rate, samples)`` tuple whose sample rate is the
    constant 16000.
    """
    # NOTE(review): the scaling assumes `synthesise` yields floats within
    # [-1, 1]; larger magnitudes would not fit in int16 - confirm.
    waveform = synthesise(transcribe(audio))
    pcm_samples = (waveform * 32767).astype(np.int16)
    return 16_000, pcm_samples


# The pipeline in this file produces Dutch speech (ASR language "nl" and a
# Dutch-finetuned SpeechT5 checkpoint), so the UI title names Dutch.
title = "Speech-To-Speech-Translation for Dutch"
description = """
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

# Tab 1: record from the microphone; the handler receives a file path.
mic_translate = gr.Interface(
    fn=speech_to_dutch_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Tab 2: upload an existing audio file; same handler and output component.
file_translate = gr.Interface(
    fn=speech_to_dutch_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Present both interfaces as tabs inside the single Blocks app.
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=False)