AK-12 committed on
Commit 9cb17c0
1 Parent(s): fe4bfeb

Update app.py

Files changed (1)
app.py +76 -4
app.py CHANGED
@@ -1,7 +1,79 @@
 import gradio as gr
+import numpy as np
+import torch
+from datasets import load_dataset
 
-def greet(name):
-    return "Hello " + name + "!!"
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, AutoProcessor
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# load speech translation checkpoint
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+
+# load fine-tuned text-to-speech checkpoint
+processor = AutoProcessor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
+model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl").to(device)
+
+# load vocoder and speaker embeddings
+# (base SpeechT5 text-to-speech checkpoint kept for reference)
+# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+
+
+def translate(audio):
+    # Whisper's built-in translate task only targets English, so French output
+    # is forced via the transcribe task with language="fr"
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "fr"})
+    return outputs["text"]
+
+
+def synthesise(text):
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    return speech.cpu()
+
+
+def speech_to_speech_translation(audio):
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    # scale the float waveform in [-1, 1] to 16-bit PCM for Gradio's numpy audio output
+    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    return 16000, synthesised_speech
+
+
+title = "Cascaded STST"
+description = """
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. The demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
+[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
+![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+"""
+
+demo = gr.Blocks()
+
+mic_translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    examples=[["./example.wav"]],
+    title=title,
+    description=description,
+)
+
+file_translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="upload", type="filepath"),
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    examples=[["./example.wav"]],
+    title=title,
+    description=description,
+)
+
+with demo:
+    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+
+demo.launch(share=True)
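
A note on the translate() step above: Whisper's built-in "translate" task only targets English, so the commit instead forces French output by combining task="transcribe" with language="fr". A minimal sketch of that step in isolation, using the same checkpoint and generate_kwargs as app.py ("sample.wav" is a placeholder input path):

from transformers import pipeline

# same checkpoint and generate_kwargs as in app.py; "sample.wav" is a placeholder
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
outputs = asr_pipe("sample.wav", max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "fr"})
print(outputs["text"])  # text decoded in French, whatever the source language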
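
For a quick sanity check outside the Gradio UI, the (sample_rate, int16 array) tuple returned by speech_to_speech_translation() can be written straight to disk. A sketch, assuming app.py's definitions are in scope and scipy is installed ("example.wav" is a placeholder path):

from scipy.io import wavfile

# run the full cascade on a local file and save the result for listening
sample_rate, audio = speech_to_speech_translation("example.wav")
wavfile.write("translated.wav", sample_rate, audio)  # 16 kHz, 16-bit PCM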