import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import phonemizer
import librosa
import io
import base64
from strsimpy.jaro_winkler import JaroWinkler


def speechToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded WAV audio directly to a phoneme string."""
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    )
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")

    # Resample the input audio to the 16 kHz rate wav2vec 2.0 expects.
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)

    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    # Drop the spaces the CTC decoder inserts between phoneme tokens.
    speechToPhonemeTranscription = transcription[0].replace(" ", "")
    return speechToPhonemeTranscription
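
# Note: the phoneme model emits space-separated IPA-style symbols (for
# "hello", something like "h ə l oʊ"; illustrative, not captured from an
# actual run), so the replace() above collapses them into one comparable
# string.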


def speechToTextToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded WAV audio to text, then phonemize that text."""
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))

    # Resample the input audio to the 16 kHz rate the model expects.
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Run inference without gradient tracking, matching speechToPhonemeWS.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    speechToTextTranscription = processor.batch_decode(predicted_ids)

    # Phonemize the recognized text so it can be compared against the direct
    # speech-to-phoneme transcription.
    graphemeToPhonemeTranscription = phonemizer.phonemize(speechToTextTranscription[0])
    graphemeToPhonemeTranscription = graphemeToPhonemeTranscription.replace(" ", "")
    return [speechToTextTranscription[0], graphemeToPhonemeTranscription]


def similarity(S2P, G2P2T):
    """Jaro-Winkler similarity (0.0 to 1.0) between two phoneme strings."""
    jarowinkler = JaroWinkler()
    similarity_score = jarowinkler.similarity(S2P, G2P2T)
    return similarity_score
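
# As a rough illustration (values worked out by hand from the standard
# Jaro-Winkler formula, not taken from the original file):
# similarity("həloʊ", "hɛloʊ") comes to about 0.88, identical strings score
# exactly 1.0, and strings with no matching characters score 0.0.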


def similarityScoreToBand(similarity_score):
    """Map a similarity score onto a 1-9 pronunciation band."""
    if similarity_score >= 0.91:
        return 9
    elif similarity_score >= 0.81:
        return 8
    elif similarity_score >= 0.73:
        return 7
    elif similarity_score >= 0.65:
        return 6
    elif similarity_score >= 0.60:
        return 5
    elif similarity_score >= 0.46:
        return 4
    elif similarity_score >= 0.35:
        return 3
    elif similarity_score >= 0.1:
        return 2
    else:
        return 1
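
# For example, a similarity score of 0.75 clears the 0.73 threshold but not
# the 0.81 one, so it maps to band 7.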


def lark(audioAsB64):
    """Score pronunciation by comparing the direct phoneme transcription of
    the audio against the phonemized form of its text transcription."""
    s2p = speechToPhonemeWS(audioAsB64)
    [s2t, s2t2p] = speechToTextToPhonemeWS(audioAsB64)
    ss = similarity(s2t2p, s2p)
    band = similarityScoreToBand(ss)
    return [ss, band, s2t]


iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text", "text"])
iface.launch()
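
# Example usage (a sketch, not part of the original app): encode a local WAV
# file to base64 and call lark() directly, bypassing the Gradio UI.
# "sample.wav" is a hypothetical path.
#
#     with open("sample.wav", "rb") as f:
#         audio_b64 = base64.b64encode(f.read()).decode("utf-8")
#     score, band, transcript = lark(audio_b64)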