w11wo committed
Commit 9839c08 • 1 Parent(s): efe3426

Minor Improvements

Files changed (3):
  1. .gitignore +3 -0
  2. README.md +2 -2
  3. app.py +90 -83
.gitignore ADDED
@@ -0,0 +1,3 @@
+ tmp/
+ __pycache__
+ .DS_Store
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
- title: SpeechlineDemo
- emoji: 📉
+ title: SpeechLine
+ emoji: 🎙️
  colorFrom: yellow
  colorTo: purple
  sdk: gradio
app.py CHANGED
@@ -1,115 +1,122 @@
- from speechline.transcribers import Wav2Vec2Transcriber
- from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
- from speechline.utils.tokenizer import WordTokenizer
- from datasets import Dataset, Audio
  from pathlib import Path

- import os
  import gradio as gr
- import shutil
  import pandas as pd

- max_textboxes=5

- def preprocess(audio_path, transcriber):
-     dataset = Dataset.from_dict({"audio": [audio_path]})
-     dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
-     return dataset

- def transcribe(audio_path, transcriber):
-     dataset = preprocess(audio_path, transcriber)
-     output_offsets = transcriber.predict(dataset, output_offsets=True)
-     return output_offsets
-
- def segmentation_interface(choice):
-     if choice == "silence":
          return gr.update(visible=True), gr.update(visible=False)
-     elif choice == "word_overlap":
          return gr.update(visible=False), gr.update(visible=True)
-     else:
-         return gr.update(visible=False), gr.update(visible=False)
-

- def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
-     output_dir = "./audio_chunks"

      transcriber = Wav2Vec2Transcriber(model)
-     output_offsets = transcribe(audio_path, transcriber)

-     if segmentation_type == "silence":
          segmenter = SilenceSegmenter()
-     elif segmentation_type == "word_overlap":
          segmenter = WordOverlapSegmenter()

      tokenizer = WordTokenizer()

-     if os.path.exists(f"{output_dir}/tmp"):
-         shutil.rmtree(f"{output_dir}/tmp")

      segmenter.chunk_audio_segments(
          audio_path,
-         output_dir,
          output_offsets[0],
          minimum_chunk_duration=0,
          silence_duration=silence_duration,
          ground_truth=tokenizer(ground_truth),
      )
-
-     outputs = []
-     idx = 0
-     for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
-         if str(path).split('.')[-1] == 'tsv':
-             gt = pd.read_csv(path, sep='\t', names=["start_offset", "end_offset", "text"])
-             outputs.append(gr.Dataframe.update(value=gt,visible=True))
-             idx+=1
-         if str(path).split('.')[-1] == 'wav':
-             audio = (str(path))
-             outputs.append(gr.Audio.update(value=audio, visible=True))
-
-     for i in range(max_textboxes-idx):
-         outputs.append(gr.Dataframe.update(visible=False))
-         outputs.append(gr.Audio.update(visible=False))
-     outputs.append(gr.Column.update(visible=True))
      return outputs

  with gr.Blocks() as demo:
      with gr.Row():
          with gr.Column():
              audio = gr.Audio(type="filepath")
-             radio = gr.Radio(["silence", "word_overlap"], label="Select Segmentation Method", required=True)
-             model = gr.Dropdown(["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"], value="facebook/wav2vec2-base-960h", label="Select Model")
-             slider = gr.Slider(0, 100, value=3, step=0.1, label="silence duration", visible=False)
-             gt = gr.Textbox(label="Ground Truth", placeholder="Enter Ground Truth Text", interactive=True, visible=False)
-
-             radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
-
-             inputs = [audio, model, radio, slider, gt]
-             transcribe_btn = gr.Button("Transcribe")
-
-
-         with gr.Column(visible=False) as output_col:
-             outputs = []
-             gt1 = gr.Dataframe(visible=False)
-             audio1 = gr.Audio(visible=False)
-
-             gt2 = gr.Dataframe(visible=False)
-             audio2 = gr.Audio(visible=False)
-
-             gt3 = gr.Dataframe(visible=False)
-             audio3 = gr.Audio(visible=False)
-
-             gt4 = gr.Dataframe(visible=False)
-             audio4 = gr.Audio(visible=False)
-
-             gt5 = gr.Dataframe(visible=False)
-             audio5 = gr.Audio(visible=False)
-
-
-     for i in range(max_textboxes):
-         outputs.append(gr.Dataframe(visible=False))
-         outputs.append(gr.Audio(visible=False))
-     outputs.append(output_col)
-
-     transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)
-
- demo.queue().launch()
+ import os
+ import shutil

  from pathlib import Path

  import gradio as gr
  import pandas as pd
+ from datasets import Audio, Dataset
+ from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
+ from speechline.transcribers import Wav2Vec2Transcriber
+ from speechline.utils.tokenizer import WordTokenizer

+ MAX_SEGMENTS = 10
+ OUTPUT_DIR = "tmp"

+ def segmentation_interface(choice: str):
+     if choice == "Silence Gap":
          return gr.update(visible=True), gr.update(visible=False)
+     elif choice == "Word Overlap":
          return gr.update(visible=False), gr.update(visible=True)


+ def run(audio_path, model, segmentation_type, silence_duration, ground_truth):
      transcriber = Wav2Vec2Transcriber(model)
+     dataset = Dataset.from_dict({"audio": [audio_path]})
+     dataset = dataset.cast_column(
+         "audio", Audio(sampling_rate=transcriber.sampling_rate)
+     )
+     output_offsets = transcriber.predict(dataset, output_offsets=True)

+     if segmentation_type == "Silence Gap":
          segmenter = SilenceSegmenter()
+     elif segmentation_type == "Word Overlap":
          segmenter = WordOverlapSegmenter()

      tokenizer = WordTokenizer()

+     if os.path.exists(OUTPUT_DIR):
+         shutil.rmtree(OUTPUT_DIR)

      segmenter.chunk_audio_segments(
          audio_path,
+         OUTPUT_DIR,
          output_offsets[0],
          minimum_chunk_duration=0,
          silence_duration=silence_duration,
          ground_truth=tokenizer(ground_truth),
      )
+
+     outputs, idx = [], 0
+
+     for path in sorted(Path(OUTPUT_DIR).rglob("*")):
+         if path.suffix == ".tsv":
+             gt = pd.read_csv(
+                 path, sep="\t", names=["start_offset", "end_offset", "text"]
+             )
+             outputs.append(gr.Dataframe.update(value=gt, visible=True))
+         elif path.suffix == ".wav":
+             outputs.append(gr.Audio.update(value=str(path), visible=True))
+             idx += 1
+
+     for _ in range(MAX_SEGMENTS - idx):
+         outputs += [gr.Dataframe.update(visible=False), gr.Audio.update(visible=False)]
      return outputs

+
  with gr.Blocks() as demo:
+     gr.Markdown(
+         f"""
+         <center>
+
+         # 🎙️ SpeechLine Demo
+         [Repository](https://github.com/bookbot-kids/speechline) | [Documentation](https://bookbot-kids.github.io/speechline/)
+
+         </center>
+         """
+     )
+
      with gr.Row():
          with gr.Column():
              audio = gr.Audio(type="filepath")
+             model = gr.Dropdown(
+                 choices=[
+                     "facebook/wav2vec2-base-960h",
+                 ],
+                 value="facebook/wav2vec2-base-960h",
+                 label="Transcriber Model",
+             )
+             segmenter = gr.Radio(
+                 choices=["Silence Gap", "Word Overlap"],
+                 value="Silence Gap",
+                 label="Segmentation Method",
+             )
+             sil = gr.Slider(
+                 0, 1, value=0.1, step=0.1, label="Silence Duration", visible=True
+             )
+             gt = gr.Textbox(
+                 label="Ground Truth",
+                 placeholder="Enter Ground Truth Text",
+                 interactive=True,
+                 visible=False,
+             )
+
+             segmenter.change(
+                 fn=segmentation_interface, inputs=segmenter, outputs=[sil, gt]
+             )
+
+             inputs = [audio, model, segmenter, sil, gt]
+             transcribe_btn = gr.Button("Transcribe")
+
+         with gr.Column():
+             outputs = [
+                 gr.Dataframe(
+                     visible=True, headers=["start_offset", "end_offset", "text"]
+                 ),
+                 gr.Audio(visible=True),
+             ]
+             for _ in range(MAX_SEGMENTS - 1):
+                 outputs += [gr.Dataframe(visible=False), gr.Audio(visible=False)]
+
+     transcribe_btn.click(fn=run, inputs=inputs, outputs=outputs)
+
+ demo.launch()
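
Note on the output layout in the new app.py: rather than creating components on the fly, it pre-creates a fixed number of hidden Dataframe/Audio slots (MAX_SEGMENTS) and has the callback return one gr.update(...) per slot, revealing only the slots it actually filled. Below is a minimal, self-contained sketch of that fixed-slot pattern, assuming Gradio 3.x (the API the diff uses); it has no speechline dependency, and the component names and the toy fill_slots function are illustrative only.

```python
import gradio as gr

MAX_SLOTS = 3  # app.py uses MAX_SEGMENTS = 10


def fill_slots(text: str):
    # Return exactly one update per pre-created slot:
    # filled slots become visible, the rest stay hidden.
    words = text.split()[:MAX_SLOTS]
    updates = [gr.Textbox.update(value=w, visible=True) for w in words]
    updates += [gr.Textbox.update(visible=False)] * (MAX_SLOTS - len(words))
    return updates


with gr.Blocks() as sketch:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    # Fixed set of hidden output slots, analogous to the Dataframe/Audio pairs in app.py.
    slots = [gr.Textbox(visible=False) for _ in range(MAX_SLOTS)]
    btn.click(fn=fill_slots, inputs=inp, outputs=slots)

if __name__ == "__main__":
    sketch.launch()
```

Because Gradio needs every output component to exist when the interface is built, sizing the slot count up front and toggling visibility is the usual way to present a variable-length list of results, which is what the segment chunks produced by chunk_audio_segments are.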