Spaces: Runtime error
divakaivan committed • Commit 830a7a9
Parent(s): 8bc0d0f
Update app.py
app.py CHANGED
@@ -21,6 +21,109 @@ speaker_embeddings = {
 }


+from datasets import load_dataset, Audio
+
+dataset = load_dataset(
+    "divakaivan/glaswegian_audio"
+)
+
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))['train']
+
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
+
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+
+tokenizer = processor.tokenizer
+
+def extract_all_chars(batch):
+    all_text = " ".join(batch["transcription"])
+    vocab = list(set(all_text))
+    return {"vocab": [vocab], "all_text": [all_text]}
+
+vocabs = dataset.map(
+    extract_all_chars,
+    batched=True,
+    batch_size=-1,
+    keep_in_memory=True,
+    remove_columns=dataset.column_names,
+)
+
+dataset_vocab = set(vocabs["vocab"][0])
+tokenizer_vocab = {k for k,_ in tokenizer.get_vocab().items()}
+
+replacements = [
+    ('à', 'a'),
+    ('ç', 'c'),
+    ('è', 'e'),
+    ('ë', 'e'),
+    ('í', 'i'),
+    ('ï', 'i'),
+    ('ö', 'o'),
+    ('ü', 'u'),
+]
+
+def cleanup_text(inputs):
+    for src, dst in replacements:
+        inputs["transcription"] = inputs["transcription"].replace(src, dst)
+    return inputs
+
+dataset = dataset.map(cleanup_text)
+
+import os
+import torch
+from speechbrain.inference.speaker import EncoderClassifier
+
+spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+speaker_model = EncoderClassifier.from_hparams(
+    source=spk_model_name,
+    run_opts={"device": device},
+    savedir=os.path.join("/tmp", spk_model_name),
+)
+
+def create_speaker_embedding(waveform):
+    with torch.no_grad():
+        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
+        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
+    return speaker_embeddings
+
+
+def prepare_dataset(example):
+    # load the audio data; if necessary, this resamples the audio to 16kHz
+    audio = example["audio"]
+
+    # feature extraction and tokenization
+    example = processor(
+        text=example["transcription"],
+        audio_target=audio["array"],
+        sampling_rate=audio["sampling_rate"],
+        return_attention_mask=False,
+    )
+
+    # strip off the batch dimension
+    example["labels"] = example["labels"][0]
+
+    # use SpeechBrain to obtain x-vector
+    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
+
+    return example
+
+processed_example = prepare_dataset(dataset[0])
+from transformers import SpeechT5HifiGan
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+spectrogram = torch.tensor(processed_example["labels"])
+with torch.no_grad():
+    speech = vocoder(spectrogram)
+
+dataset = dataset.map(
+    prepare_dataset, remove_columns=dataset.column_names,
+)
+
+
 def predict(text, speaker):
     if len(text.strip()) == 0:
         return (16000, np.zeros(0).astype(np.int16))
@@ -31,23 +134,9 @@ def predict(text, speaker):
     input_ids = inputs["input_ids"]
     input_ids = input_ids[..., :model.config.max_text_positions]

-
-
-
-        key = list(speaker_embeddings.keys())[idx]
-        speaker_embedding = np.load(speaker_embeddings[key])
-
-        # randomly shuffle the elements
-        np.random.shuffle(speaker_embedding)
-
-        # randomly flip half the values
-        x = (np.random.rand(512) >= 0.5) * 1.0
-        x[x == 0] = -1.0
-        speaker_embedding *= x
-
-        #speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
-    else:
-        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
+    ### ### ###
+    example = dataset["test"][11]
+    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

     speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

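For context, the new predict() body replaces the deleted shuffle-and-flip randomization with a fixed x-vector taken from the processed dataset. Below is a minimal sketch of how the objects this commit introduces (processor, model, vocoder, a precomputed speaker embedding) are typically wired together at SpeechT5 inference time. It is an editor's assumption, not code from the commit: the helper name synthesize is illustrative, generate_speech is the standard SpeechT5ForTextToSpeech API, and the variable name is normalized here (the hunk assigns speaker_embeddings while the surviving context line still reads speaker_embedding).

import numpy as np
import torch

def synthesize(text, speaker_embedding):
    # speaker_embedding: a (1, 512) float tensor, e.g. built from
    # example["speaker_embeddings"] as in the new predict() body
    inputs = processor(text=text, return_tensors="pt")
    # truncate to the model's maximum text length, as predict() does
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]
    with torch.no_grad():
        speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    # 16 kHz int16 audio, matching predict()'s empty-input return value
    return (16000, (speech.numpy() * 32767).astype(np.int16))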