seba3y committed on
Commit 0af8dc2
1 Parent(s): 5101797

Upload 4 files

Files changed (4)
  1. app.py +52 -0
  2. model.py +97 -0
  3. requirements.txt +9 -0
  4. wav2vec_aligen.py +51 -0
app.py ADDED
@@ -0,0 +1,52 @@
+ import gradio as gr
+ from scipy.io import wavfile
+ from wav2vec_aligen import speaker_pronunciation_assesment
+
+
+ def analyze_audio(audio):
+     if audio is None:
+         return 'The audio is missing.'
+     # Gradio provides the recording as a (sample_rate, data) tuple; write it to a temporary WAV file
+     temp_filename = 'temp_audio.wav'
+     wavfile.write(temp_filename, audio[0], audio[1])
+
+     result = speaker_pronunciation_assesment(temp_filename)
+     accuracy_score = result['pronunciation_accuracy']
+     fluency_score = result['fluency_score']
+     total_score = result['total_score']
+     content_scores = result['content_scores']
+
+     result_markdown = f"""|Language Aspect| Score|
+ |---|---|
+ |Pronunciation Accuracy| {accuracy_score}|
+ |Fluency| {fluency_score}|
+ |Total Score| {total_score}|
+ |Content Score| {content_scores}|
+ """
+     return result_markdown
+
+
+ CHOICES = ['Daibers', 'Carbon', 'Reptiles']
+
+
+ def get_paired_text(value):
+     text = f'## {value}'
+     return text
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 drp_down = gr.Dropdown(choices=CHOICES, scale=2)
+                 show_text_btn = gr.Button("Select", scale=1)
+             read_text = gr.Markdown(label='Listen to speech')
+             show_text_btn.click(get_paired_text, inputs=drp_down, outputs=read_text)
+             audio_area = gr.Audio(label='Repeat the sentence')
+             analyze_audio_btn = gr.Button("Submit", scale=1)
+         with gr.Column():
+             capt_area = gr.Markdown(label='CAPT Scores')
+     analyze_audio_btn.click(analyze_audio, inputs=audio_area, outputs=capt_area)
+ demo.launch()
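Running this file directly (e.g. `python app.py`) launches the Gradio demo locally; on a Gradio-based Hugging Face Space, app.py is the default entry point and is started automatically.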
model.py ADDED
@@ -0,0 +1,97 @@
+ from transformers import Wav2Vec2BertPreTrainedModel, Wav2Vec2BertModel
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from typing import Optional, Tuple, Union
+ from torch.nn import MSELoss
+ import torch
+ import torch.nn as nn
+
+ # hidden_states start at index 2 of the base model's tuple output
+ # (after last_hidden_state and extract_features), as in transformers' wav2vec2 modeling file
+ _HIDDEN_STATES_START_POSITION = 2
+
+
+ class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert
+     def __init__(self, config):
+         super().__init__(config)
+
+         if hasattr(config, "add_adapter") and config.add_adapter:
+             raise ValueError(
+                 "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
+             )
+         self.wav2vec2_bert = Wav2Vec2BertModel(config)
+         num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+         if config.use_weighted_layer_sum:
+             self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+         self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+         self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def freeze_base_model(self):
+         """
+         Calling this function will disable the gradient computation for the base model so that its parameters will not
+         be updated during training. Only the classification head will be updated.
+         """
+         for param in self.wav2vec2_bert.parameters():
+             param.requires_grad = False
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert,WAV_2_VEC_2->WAV2VEC2_BERT, input_values->input_features
+     def forward(
+         self,
+         input_features: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Union[Tuple, SequenceClassifierOutput]:
+         r"""
+         labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
+             Target scores for computing the regression loss. This head always computes a mean-squared error
+             against the `config.num_labels` predicted scores.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+         outputs = self.wav2vec2_bert(
+             input_features,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         if self.config.use_weighted_layer_sum:
+             hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+             hidden_states = torch.stack(hidden_states, dim=1)
+             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+         else:
+             hidden_states = outputs[0]
+
+         hidden_states = self.projector(hidden_states)
+         if attention_mask is None:
+             pooled_output = hidden_states.mean(dim=1)
+         else:
+             padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+             hidden_states[~padding_mask] = 0.0
+             pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+         logits = self.classifier(pooled_output)
+         # Scores are non-negative, so clamp the regression outputs with ReLU
+         logits = nn.functional.relu(logits)
+
+         loss = None
+         if labels is not None:
+             loss_fct = MSELoss()
+             loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1, self.config.num_labels))
+
+         if not return_dict:
+             output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ wave
+ torch
+ optimum
+ scipy
+ numpy
+ resampy
+ gradio
+ librosa
+ transformers
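These dependencies can be installed in one step, e.g. `pip install -r requirements.txt`.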
wav2vec_aligen.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import librosa
+ import os
+ from model import Wav2Vec2BertForSequenceClassification
+ from transformers import AutoFeatureExtractor
+ # from optimum.bettertransformer import BetterTransformer
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ # os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+ # os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+ # os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+ torch.random.manual_seed(0)
+ # protobuf==3.20.0
+
+ model_name = "arslanarjumand/wav2vec-repeat"
+ processor = AutoFeatureExtractor.from_pretrained(model_name)
+ model = Wav2Vec2BertForSequenceClassification.from_pretrained(model_name).to(device)
+ # model = BetterTransformer.transform(model)
+
+
+ def load_audio(audio_path, processor):
+     # Resample to 16 kHz, the rate the feature extractor expects
+     audio, _ = librosa.load(audio_path, sr=16000)
+     input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+     return input_values.to(device)
+
+
+ @torch.inference_mode()
+ def get_emissions(input_values, model):
+     results = model(input_values).logits[0]
+     return results
+
+
+ def speaker_pronunciation_assesment(audio_path):
+     input_values = load_audio(audio_path, processor)
+     result_scores = get_emissions(input_values, model)
+
+     # The four regression outputs are the pronunciation, fluency, total and content scores
+     pronunciation_score = round(result_scores[0].cpu().item())
+     fluency_score = round(result_scores[1].cpu().item())
+     total_score = round(result_scores[2].cpu().item())
+     content_scores = round(result_scores[3].cpu().item())
+
+     result = {'pronunciation_accuracy': pronunciation_score,
+               'content_scores': content_scores,
+               'total_score': total_score,
+               'fluency_score': fluency_score}
+     return result
+
+
+ if __name__ == '__main__':
+     pass
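A minimal sketch of how this module can be exercised on its own, outside the Gradio app; the audio path below is a placeholder for any locally available recording:

    from wav2vec_aligen import speaker_pronunciation_assesment

    scores = speaker_pronunciation_assesment('my_recording.wav')  # placeholder path
    print(scores)  # {'pronunciation_accuracy': ..., 'content_scores': ..., 'total_score': ..., 'fluency_score': ...}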