Ellight committed on
Commit
c318cb1
1 Parent(s): bf0e456

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -36
app.py CHANGED
@@ -1,59 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import torch
4
  from datasets import load_dataset
5
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
6
 
 
 
7
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
  # load speech translation checkpoint
11
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
12
 
13
  # load text-to-speech checkpoint and speaker embeddings
14
- model_id = "microsoft/speecht5_tts" #"Ellight/speecht5_finetuned_voxpopuli_nl" # update with your model id
15
- # pipe = pipeline("automatic-speech-recognition", model=model_id)
16
- model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
17
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
18
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation",trust_remote_code=True))
19
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
20
- # speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
21
 
22
- processor = SpeechT5Processor.from_pretrained(model_id)
 
 
23
 
24
- replacements = [
25
- ("à", "a"),
26
- ("ç", "c"),
27
- ("è", "e"),
28
- ("ë", "e"),
29
- ("í", "i"),
30
- ("ï", "i"),
31
- ("ö", "o"),
32
- ("ü", "u"),
33
- ]
34
-
35
- def cleanup_text(text):
36
- for src, dst in replacements:
37
- text = text.replace(src, dst)
38
- return text
39
-
40
- def synthesize_speech(text):
41
- text = cleanup_text(text)
42
- inputs = processor(text=text, return_tensors="pt")
43
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
44
 
45
- return gr.Audio.update(value=(16000, speech.cpu().numpy()))
46
 
47
  def translate(audio):
48
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "Dutch"})
49
  return outputs["text"]
50
 
51
-
52
  def synthesise(text):
53
- text = cleanup_text(text)
54
- inputs = processor(text=text, return_tensors="pt")
55
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
 
 
56
  return speech.cpu()
 
 
 
 
 
57
 
58
 
59
  def speech_to_speech_translation(audio):
 
1
+ # import gradio as gr
2
+ # import numpy as np
3
+ # import torch
4
+ # from datasets import load_dataset
5
+ # from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
6
+
7
+
8
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
+
10
+ # # load speech translation checkpoint
11
+ # asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
12
+
13
+ # # load text-to-speech checkpoint and speaker embeddings
14
+ # model_id = "microsoft/speecht5_tts" #"Ellight/speecht5_finetuned_voxpopuli_nl" # update with your model id
15
+ # # pipe = pipeline("automatic-speech-recognition", model=model_id)
16
+ # model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
17
+ # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
18
+ # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation",trust_remote_code=True)
19
+ # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
20
+ # # speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
21
+
22
+ # processor = SpeechT5Processor.from_pretrained(model_id)
23
+
24
+ # replacements = [
25
+ # ("à", "a"),
26
+ # ("ç", "c"),
27
+ # ("è", "e"),
28
+ # ("ë", "e"),
29
+ # ("í", "i"),
30
+ # ("ï", "i"),
31
+ # ("ö", "o"),
32
+ # ("ü", "u"),
33
+ # ]
34
+
35
+ # def cleanup_text(text):
36
+ # for src, dst in replacements:
37
+ # text = text.replace(src, dst)
38
+ # return text
39
+
40
+ # def synthesize_speech(text):
41
+ # text = cleanup_text(text)
42
+ # inputs = processor(text=text, return_tensors="pt")
43
+ # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
44
+
45
+ # return gr.Audio.update(value=(16000, speech.cpu().numpy()))
46
+
47
+ # def translate(audio):
48
+ # outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "Dutch"})
49
+ # return outputs["text"]
50
+
51
+
52
+ # def synthesise(text):
53
+ # text = cleanup_text(text)
54
+ # inputs = processor(text=text, return_tensors="pt")
55
+ # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
56
+ # return speech.cpu()
57
+
58
+
59
+ # def speech_to_speech_translation(audio):
60
+ # translated_text = translate(audio)
61
+ # synthesised_speech = synthesise(translated_text)
62
+ # synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
63
+ # return 16000, synthesised_speech
64
+
65
  import gradio as gr
66
  import numpy as np
67
  import torch
68
  from datasets import load_dataset
 
69
 
70
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
71
+ from transformers import VitsModel, VitsTokenizer
72
 
73
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
74
 
75
  # load speech translation checkpoint
76
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
77
 
78
  # load text-to-speech checkpoint and speaker embeddings
79
+ # processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
80
+ # model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl").to(device)
 
 
 
 
 
81
 
82
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
83
+ model = VitsModel.from_pretrained("Matthijs/mms-tts-nld")
84
+ tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-nld")
85
 
86
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
87
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
 
89
 
90
  def translate(audio):
91
+ outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
92
  return outputs["text"]
93
 
 
94
  def synthesise(text):
95
+ inputs = tokenizer(text, return_tensors="pt")
96
+ with torch.no_grad():
97
+ outputs = model(inputs["input_ids"])
98
+ speech = outputs.audio[0]
99
+
100
  return speech.cpu()
101
+
102
+ # def synthesise(text):
103
+ # inputs = processor(text=text, return_tensors="pt", padding='max_length', truncation=True)
104
+ # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
105
+ # return speech.cpu()
106
 
107
 
108
  def speech_to_speech_translation(audio):