divakaivan committed on
Commit
830a7a9
1 Parent(s): 8bc0d0f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -17
app.py CHANGED
@@ -21,6 +21,109 @@ speaker_embeddings = {
21
  }
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def predict(text, speaker):
25
  if len(text.strip()) == 0:
26
  return (16000, np.zeros(0).astype(np.int16))
@@ -31,23 +134,9 @@ def predict(text, speaker):
31
  input_ids = inputs["input_ids"]
32
  input_ids = input_ids[..., :model.config.max_text_positions]
33
 
34
- if speaker == "Surprise Me!":
35
- # load one of the provided speaker embeddings at random
36
- idx = np.random.randint(len(speaker_embeddings))
37
- key = list(speaker_embeddings.keys())[idx]
38
- speaker_embedding = np.load(speaker_embeddings[key])
39
-
40
- # randomly shuffle the elements
41
- np.random.shuffle(speaker_embedding)
42
-
43
- # randomly flip half the values
44
- x = (np.random.rand(512) >= 0.5) * 1.0
45
- x[x == 0] = -1.0
46
- speaker_embedding *= x
47
-
48
- #speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
49
- else:
50
- speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
51
 
52
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
53
 
 
21
  }
22
 
23
 
24
from datasets import load_dataset, Audio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Glaswegian speech corpus; resample every clip to the 16 kHz rate
# that SpeechT5 expects, and keep only the training split.
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))['train']

# Pretrained SpeechT5 text-to-speech checkpoint plus its processor.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Convenience handle on the processor's text tokenizer.
tokenizer = processor.tokenizer

39
def extract_all_chars(batch):
    """Collect the set of characters used by a batch of transcriptions.

    Returns single-element lists (the map call uses one whole-dataset
    batch): ``vocab`` holds the unique characters, ``all_text`` the
    space-joined transcriptions they came from.
    """
    joined = " ".join(batch["transcription"])
    unique_chars = list(set(joined))
    return {"vocab": [unique_chars], "all_text": [joined]}

44
# Gather the dataset's character-level vocabulary in one batched pass.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,          # a single batch covering the whole split
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

# Characters present in the transcriptions vs. tokens the tokenizer knows;
# the difference identifies characters that need cleanup before training.
dataset_vocab = set(vocabs["vocab"][0])
# Idiom fix: iterating .items() only to keep the keys was redundant —
# set(dict) is the keys as a set.
tokenizer_vocab = set(tokenizer.get_vocab())

54
+
55
# Accent-folding table: map characters absent from the SpeechT5 tokenizer
# vocabulary onto their plain-ASCII equivalents.
replacements = [
    ('à', 'a'),
    ('ç', 'c'),
    ('è', 'e'),
    ('ë', 'e'),
    ('í', 'i'),
    ('ï', 'i'),
    ('ö', 'o'),
    ('ü', 'u'),
]

def cleanup_text(inputs):
    """Strip accents from ``inputs['transcription']`` using ``replacements``.

    Mutates and returns *inputs* so it can be used with ``Dataset.map``.
    """
    text = inputs["transcription"]
    for accented, plain in replacements:
        text = text.replace(accented, plain)
    inputs["transcription"] = text
    return inputs

70
+
71
# Normalise every transcription before tokenisation.
dataset = dataset.map(cleanup_text)

import os
import torch
from speechbrain.inference.speaker import EncoderClassifier

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Load the SpeechBrain x-vector speaker encoder, on GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

85
+
86
def create_speaker_embedding(waveform):
    """Encode a waveform into a unit-norm x-vector as a numpy array.

    Runs the SpeechBrain encoder without gradient tracking, L2-normalises
    along the embedding dimension, and returns the squeezed CPU result.
    """
    with torch.no_grad():
        embedding = speaker_model.encode_batch(torch.tensor(waveform))
        embedding = torch.nn.functional.normalize(embedding, dim=2)
    return embedding.squeeze().cpu().numpy()

92
+
93
+
94
def prepare_dataset(example):
    """Turn one dataset row into SpeechT5 training features.

    Tokenises the transcription, extracts the spectrogram target from the
    (already 16 kHz) audio, and attaches the x-vector speaker embedding.
    """
    audio = example["audio"]

    # One processor call handles both text tokenisation and the
    # audio-target feature extraction.
    features = processor(
        text=example["transcription"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # The processor adds a batch axis; drop it for per-example storage.
    features["labels"] = features["labels"][0]

    # SpeechBrain x-vector for this utterance's speaker.
    features["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return features

113
+
114
# Smoke-test the feature pipeline on a single row before mapping the split.
processed_example = prepare_dataset(dataset[0])

from transformers import SpeechT5HifiGan
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Vocode the example's spectrogram to confirm the labels are well-formed.
spectrogram = torch.tensor(processed_example["labels"])
with torch.no_grad():
    speech = vocoder(spectrogram)

# Process the whole split, replacing the raw columns with model features.
dataset = dataset.map(
    prepare_dataset, remove_columns=dataset.column_names,
)
125
+
126
+
127
  def predict(text, speaker):
128
  if len(text.strip()) == 0:
129
  return (16000, np.zeros(0).astype(np.int16))
 
134
  input_ids = inputs["input_ids"]
135
  input_ids = input_ids[..., :model.config.max_text_positions]
136
 
137
+ ### ### ###
138
+ example = dataset["test"][11]
139
+ speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
142