Davidsamuel101 committed on
Commit
0ab122b
1 Parent(s): a77bcb7

Initial Commit

Files changed (2)
  1. app.py +109 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ from speechline.transcribers import Wav2Vec2Transcriber
+ from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
+ from speechline.utils.tokenizer import WordTokenizer
+ from datasets import Dataset, Audio
+ from pathlib import Path
+
+ import gradio as gr
+ import pandas as pd
+ import os
+ import shutil
+
+ max_textboxes = 5
+
+ def preprocess(audio_path, transcriber):
+     # Wrap the single audio file in a dataset resampled to the transcriber's rate.
+     dataset = Dataset.from_dict({"audio": [audio_path]})
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
+     return dataset
+
+ def transcribe(audio_path, transcriber):
+     dataset = preprocess(audio_path, transcriber)
+     # Predict transcripts together with word-level time offsets.
+     output_offsets = transcriber.predict(dataset, output_offsets=True)
+     return output_offsets
+
+ def segmentation_interface(choice):
+     # Show the silence-duration slider or the ground-truth textbox,
+     # depending on the selected segmentation method.
+     if choice == "silence":
+         return gr.update(visible=True), gr.update(visible=False)
+     elif choice == "word_overlap":
+         return gr.update(visible=False), gr.update(visible=True)
+     else:
+         return gr.update(visible=False), gr.update(visible=False)
+
+ def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
+     output_dir = "./audio_chunks"
+
+     transcriber = Wav2Vec2Transcriber(model)
+     output_offsets = transcribe(audio_path, transcriber)
+
+     if segmentation_type == "silence":
+         segmenter = SilenceSegmenter()
+     elif segmentation_type == "word_overlap":
+         segmenter = WordOverlapSegmenter()
+
+     tokenizer = WordTokenizer()
+
+     # Clear chunks left over from a previous run.
+     if os.path.exists(f"{output_dir}/tmp"):
+         shutil.rmtree(f"{output_dir}/tmp")
+
+     segmenter.chunk_audio_segments(
+         audio_path,
+         output_dir,
+         output_offsets[0],
+         minimum_chunk_duration=0,
+         silence_duration=silence_duration,
+         ground_truth=tokenizer(ground_truth),
+     )
+
+     # Collect the generated chunk transcripts (.tsv) and audio chunks (.wav).
+     outputs = []
+     idx = 0
+     for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
+         if path.suffix == ".tsv":
+             gt = pd.read_csv(path, sep="\t", names=["start_offset", "end_offset", "text"])
+             outputs.append(gr.Dataframe.update(value=gt, visible=True))
+             idx += 1
+         if path.suffix == ".wav":
+             outputs.append(gr.Audio.update(value=str(path), visible=True))
+
+     # Hide any unused output slots, then reveal the output column.
+     for i in range(max_textboxes - idx):
+         outputs.append(gr.Dataframe.update(visible=False))
+         outputs.append(gr.Audio.update(visible=False))
+     outputs.append(gr.Column.update(visible=True))
+     return outputs
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             audio = gr.Audio(type="filepath")
+             radio = gr.Radio(["silence", "word_overlap"], label="Select Segmentation Method")
+             model = gr.Dropdown(
+                 ["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"],
+                 value="facebook/wav2vec2-base-960h",
+                 label="Select Model",
+             )
+             slider = gr.Slider(0, 100, value=3, step=0.1, label="Silence Duration", visible=False)
+             gt = gr.Textbox(label="Ground Truth", placeholder="Enter Ground Truth Text", interactive=True, visible=False)
+
+             radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
+
+             inputs = [audio, model, radio, slider, gt]
+             transcribe_btn = gr.Button("Transcribe")
+
+         with gr.Column(visible=False) as output_col:
+             # Pre-allocate a fixed number of hidden output slots;
+             # process() reveals one dataframe/audio pair per chunk.
+             outputs = []
+             for i in range(max_textboxes):
+                 outputs.append(gr.Dataframe(visible=False))
+                 outputs.append(gr.Audio(visible=False))
+             outputs.append(output_col)
+
+     transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)
+
+ demo.queue().launch()
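
For reference, the same speechline pipeline can be driven without the Gradio UI. The sketch below is a minimal, untested example that reuses only the calls appearing in app.py above; the input path is a placeholder, and the empty ground truth mirrors what the app passes when silence segmentation is selected.

from speechline.transcribers import Wav2Vec2Transcriber
from speechline.segmenters import SilenceSegmenter
from speechline.utils.tokenizer import WordTokenizer
from datasets import Dataset, Audio

audio_path = "sample.wav"  # placeholder path, not from the repo

# Transcribe one file with word-level offsets, as in transcribe() above.
transcriber = Wav2Vec2Transcriber("facebook/wav2vec2-base-960h")
dataset = Dataset.from_dict({"audio": [audio_path]})
dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
offsets = transcriber.predict(dataset, output_offsets=True)

# Chunk on silences of at least 3 seconds (the slider's default in the app).
SilenceSegmenter().chunk_audio_segments(
    audio_path,
    "./audio_chunks",
    offsets[0],
    minimum_chunk_duration=0,
    silence_duration=3,
    ground_truth=WordTokenizer()(""),  # unused by silence segmentation in the app
)

The resulting chunks and their offset tables land under ./audio_chunks/tmp, which is exactly what process() then renders into the dataframe and audio slots.
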
requirements.txt ADDED
@@ -0,0 +1 @@
+ speechline @ git+https://github.com/bookbot-kids/speechline.git
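
The only pinned dependency is speechline, installed directly from its GitHub repository; gradio, datasets, and pandas are presumably provided by the Hugging Face Spaces runtime or pulled in transitively by speechline. To try the demo locally, install the requirement with pip install -r requirements.txt and start the app with python app.py.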