burraco135 committed on
Commit
2c51d44
1 Parent(s): b44b7d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -22
app.py CHANGED
@@ -8,26 +8,30 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Proce
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
  # load speech translation checkpoint
11
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
12
 
13
  # load text-to-speech checkpoint and speaker embeddings
14
- model_id = "burraco135/speecht5_finetuned_voxpopuli_it" # update with your model id
15
  # pipe = pipeline("automatic-speech-recognition", model=model_id)
16
  model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
17
- processor = SpeechT5Processor.from_pretrained(model_id)
18
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
19
- embeddings_dataset = np.load("speaker_0_embeddings.npy")
20
- speaker_embeddings = torch.tensor(embeddings_dataset).unsqueeze(0)
 
 
21
 
22
  replacements = [
23
- ("à", "a"),
24
- ("è", "e"),
25
- ("ì", "i"),
26
- ("í", "i"),
27
- ("ï", "i"),
28
- ("ò", "o"),
29
- ("ó", "o"),
30
- ("ù", "u")
 
 
 
31
  ]
32
 
33
  def cleanup_text(text):
@@ -35,13 +39,19 @@ def cleanup_text(text):
35
  text = text.replace(src, dst)
36
  return text
37
 
 
 
 
 
 
 
38
 
39
- def transcribe_to_german(audio):
40
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "italian"})
41
  return outputs["text"]
42
 
43
 
44
- def synthesise_from_german(text):
45
  text = cleanup_text(text)
46
  inputs = processor(text=text, return_tensors="pt")
47
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
@@ -49,15 +59,15 @@ def synthesise_from_german(text):
49
 
50
 
51
  def speech_to_speech_translation(audio):
52
- translated_text = transcribe_to_german(audio)
53
- synthesised_speech = synthesise_from_german(translated_text)
54
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
55
- return ((16000, synthesised_speech), translated_text)
56
 
57
 
58
  title = "Cascaded STST"
59
  description = """
60
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and [burraco135/speecht5_finetuned_voxpopuli_it](https://huggingface.co/burraco135/speecht5_finetuned_voxpopuli_it) checkpoint for text-to-speech, which is based on Microsoft's
61
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, fine-tuned in Italian Audio dataset:
62
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
63
  """
@@ -67,7 +77,7 @@ demo = gr.Blocks()
67
  mic_translate = gr.Interface(
68
  fn=speech_to_speech_translation,
69
  inputs=gr.Audio(source="microphone", type="filepath"),
70
- outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
71
  title=title,
72
  description=description,
73
  )
@@ -75,7 +85,7 @@ mic_translate = gr.Interface(
75
  file_translate = gr.Interface(
76
  fn=speech_to_speech_translation,
77
  inputs=gr.Audio(source="upload", type="filepath"),
78
- outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
79
  examples=[["./example.wav"]],
80
  title=title,
81
  description=description,
@@ -84,4 +94,4 @@ file_translate = gr.Interface(
84
  with demo:
85
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
86
 
87
- demo.launch()
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
  # load speech translation checkpoint
11
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
12
 
13
  # load text-to-speech checkpoint and speaker embeddings
14
+ model_id = "Sandiago21/speecht5_finetuned_voxpopuli_it" # update with your model id
15
  # pipe = pipeline("automatic-speech-recognition", model=model_id)
16
  model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
 
17
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
18
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
19
+ speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
20
+
21
+ processor = SpeechT5Processor.from_pretrained(model_id)
22
 
23
  replacements = [
24
+ ("á", "a"),
25
+ ("ç", "c"),
26
+ ("è", "e"),
27
+ ("ì", "i"),
28
+ ("í", "i"),
29
+ ("ò", "o"),
30
+ ("ó", "o"),
31
+ ("ù", "u"),
32
+ ("ú", "u"),
33
+ ("š", "s"),
34
+ ("ï", "i"),
35
  ]
36
 
37
  def cleanup_text(text):
 
39
  text = text.replace(src, dst)
40
  return text
41
 
42
+ def synthesize_speech(text):
43
+ text = cleanup_text(text)
44
+ inputs = processor(text=text, return_tensors="pt")
45
+ speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
46
+
47
+ return gr.Audio.update(value=(16000, speech.cpu().numpy()))
48
 
49
+ def translate(audio):
50
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "italian"})
51
  return outputs["text"]
52
 
53
 
54
+ def synthesise(text):
55
  text = cleanup_text(text)
56
  inputs = processor(text=text, return_tensors="pt")
57
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
 
59
 
60
 
61
  def speech_to_speech_translation(audio):
62
+ translated_text = translate(audio)
63
+ synthesised_speech = synthesise(translated_text)
64
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
65
+ return 16000, synthesised_speech
66
 
67
 
68
  title = "Cascaded STST"
69
  description = """
70
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Italian. Demo uses OpenAI's [Whisper Large v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and [Sandiago21/speecht5_finetuned_voxpopuli_it](https://huggingface.co/Sandiago21/speecht5_finetuned_voxpopuli_it) checkpoint for text-to-speech, which is based on Microsoft's
71
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, fine-tuned in Italian Audio dataset:
72
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
73
  """
 
77
  mic_translate = gr.Interface(
78
  fn=speech_to_speech_translation,
79
  inputs=gr.Audio(source="microphone", type="filepath"),
80
+ outputs=gr.Audio(label="Generated Speech", type="numpy"),
81
  title=title,
82
  description=description,
83
  )
 
85
  file_translate = gr.Interface(
86
  fn=speech_to_speech_translation,
87
  inputs=gr.Audio(source="upload", type="filepath"),
88
+ outputs=gr.Audio(label="Generated Speech", type="numpy"),
89
  examples=[["./example.wav"]],
90
  title=title,
91
  description=description,
 
94
  with demo:
95
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
96
 
97
+ demo.launch()