# NOTE(review): the lines that were here ("Spaces: Sleeping", file size,
# commit hashes, and a column of line numbers) were residue from a
# Hugging Face web-view scrape, not part of the source, and were removed
# so the module is importable.
import base64
import functools
import io
import math

import gradio as gr
import librosa
import phonemizer
import torch
from strsimpy.jaro_winkler import JaroWinkler
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
@functools.lru_cache(maxsize=1)
def _loadS2PModel():
    # Cache the phoneme-recognition processor/model so they are loaded once
    # per process instead of on every request (from_pretrained hits the
    # network/disk and is very slow).
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    )
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
    return processor, model
def speechToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded audio directly to a phoneme string.

    Args:
        audioAsB64: contents of an audio file encoded as a base64 string.

    Returns:
        The predicted phoneme sequence as a single string with the
        separator spaces removed, ready for string-similarity comparison.
    """
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    processor, model = _loadS2PModel()
    # librosa resamples the audio to 16 kHz, the rate wav2vec2 was trained on
    # (the original comment claiming "44.1kHz to 8kHz" was wrong).
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    # Drop spaces so the phoneme string matches the format produced by
    # speechToTextToPhonemeWS for comparison.
    return transcription[0].replace(" ", "")
@functools.lru_cache(maxsize=1)
def _loadS2TModel():
    # Cache the English ASR processor/model so they are loaded once per
    # process instead of on every request.
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    return processor, model
def speechToTextToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded audio to text, then phonemize that text.

    Args:
        audioAsB64: contents of an audio file encoded as a base64 string.

    Returns:
        A two-element list ``[text_transcription, phoneme_string]`` where
        the phoneme string has its spaces removed.
    """
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    # librosa resamples the audio to 16 kHz, the rate wav2vec2 was trained on
    # (the original comment claiming "44.1kHz to 8kHz" was wrong).
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    processor, model = _loadS2TModel()
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    # Fix: the original ran inference with autograd enabled, building an
    # unnecessary computation graph; wrap in no_grad like the sibling
    # speechToPhonemeWS does.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    speechToTextTranscription = processor.batch_decode(predicted_ids)
    # Grapheme-to-phoneme conversion of the recognized text; spaces removed
    # so it is directly comparable with the speech-to-phoneme output.
    graphemeToPhonemeTranscription = phonemizer.phonemize(
        speechToTextTranscription[0]
    ).replace(" ", "")
    return [speechToTextTranscription[0], graphemeToPhonemeTranscription]
def similarity(S2P, G2P2T):
    """Return the Jaro-Winkler similarity between two phoneme strings.

    Args:
        S2P: phoneme string from direct speech-to-phoneme transcription.
        G2P2T: phoneme string from text transcription + phonemization.

    Returns:
        A float similarity score in [0, 1].
    """
    return JaroWinkler().similarity(S2P, G2P2T)
def similarityScoreToBand(similarity_score):
    """Map a similarity score in [0, 1] onto a 1-9 band.

    The first threshold the score meets or exceeds determines the band;
    anything below 0.1 falls through to band 1.
    """
    band_thresholds = (
        (0.91, 9),
        (0.81, 8),
        (0.73, 7),
        (0.65, 6),
        (0.60, 5),
        (0.46, 4),
        (0.35, 3),
        (0.1, 2),
    )
    for threshold, band in band_thresholds:
        if similarity_score >= threshold:
            return band
    return 1
def lark(audioAsB64):
    """Score pronunciation for a base64-encoded audio clip.

    Compares the phonemes heard directly in the audio against the phonemes
    expected from its text transcription, and converts their similarity
    into a 1-9 band.

    Returns:
        [similarity_score, band, text_transcription]
    """
    phonemes_heard = speechToPhonemeWS(audioAsB64)
    transcript, phonemes_expected = speechToTextToPhonemeWS(audioAsB64)
    score = similarity(phonemes_expected, phonemes_heard)
    return [score, similarityScoreToBand(score), transcript]
# Build the Gradio UI: one text input for the base64 audio payload, and three
# text outputs matching lark's return list (score, band, transcription).
iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text", "text"])
# Start the app server at import time (standard for a Gradio Space script).
iface.launch()