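"""Gradio demo: audio classification with an Audio Spectrogram Transformer (AST).

The same classifier is exposed through four tabs: file upload, live microphone
streaming, one-shot microphone recording, and clickable example clips.
"""
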
import time

import gradio as gr
import librosa
import numpy as np
# import soundfile as sf  # enable, along with the sf.write calls below, to save clips
from transformers import pipeline

TARGET_SAMPLE_RATE = 16_000  # the AST checkpoint expects 16 kHz input
AUDIO_SECONDS_THRESHOLD = 2  # seconds to buffer before classifying in streaming mode
pipe = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
# Placeholder shown in the streaming tab until the first prediction is ready.
prediction = [{"score": 1, "label": "recording..."}]
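# pipe(...) returns a list of {"score", "label"} dicts sorted by descending
# score, e.g. [{"score": 0.45, "label": "Music"}, ...]; the placeholder above
# mimics that shape so the Label output renders before the first inference.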


def normalize_waveform(waveform, datatype=np.float32):
    """Convert int16 PCM samples (as delivered by gr.Audio) to floats in [-1, 1)."""
    waveform = waveform.astype(dtype=datatype)
    waveform /= 32768.0  # int16 full-scale magnitude
    return waveform


def streaming_recording_fn(stream, new_chunk):
    """Accumulate streamed chunks in `stream`; classify once the buffer is long enough."""
    global prediction
    sr, y = new_chunk
    y = normalize_waveform(y)
    y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    if stream is None:
        # First chunk of a new recording: start the buffer.
        stream = y
    elif (stream.shape[-1] / TARGET_SAMPLE_RATE) >= AUDIO_SECONDS_THRESHOLD:
        # Enough audio buffered: classify it, then restart the buffer with the
        # newest chunk so no audio is dropped between predictions.
        prediction = pipe(stream)
        file_name = f'./audio/{time.strftime("%Y%m%d_%H%M%S", time.localtime())}.wav'
        # sf.write(file_name, stream, TARGET_SAMPLE_RATE)
        print(f"SAVE AUDIO: {file_name}")
        print(f"{y.shape=}, {stream.shape=}\n\t{prediction[0]=}")
        stream = y
    else:
        stream = np.concatenate([stream, y], axis=-1)

    return stream, {i['label']: i['score'] for i in prediction}


def classify_waveform(waveform):
    """Classify a (sample_rate, samples) tuple from gr.Audio; return {label: score}."""
    print('-' * 120)
    print(f"{waveform=}")
    sr, y = waveform
    y = normalize_waveform(y)
    y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    result = pipe(y)
    # file_name = f'./audio/{time.strftime("%Y%m%d_%H%M%S", time.localtime())}.wav'
    # sf.write(file_name, y, TARGET_SAMPLE_RATE)
    return {i['label']: i['score'] for i in result}


# The microphone and file-upload tabs share the identical classification path.
def microphone_fn(waveform):
    return classify_waveform(waveform)


def file_fn(waveform):
    return classify_waveform(waveform)


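# In the streaming interface, the "state" input/output pair carries the audio
# buffer between calls, and live=True makes Gradio invoke the fn on each chunk.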
streaming_demo = gr.Interface(
    fn=streaming_recording_fn,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "label"],
    live=True,
)

microphone_demo = gr.Interface(
    fn=microphone_fn,
    inputs=[gr.Audio(sources=["microphone"], type="numpy")],
    outputs=["label"]
)

file_demo = gr.Interface(
    fn=file_fn,
    inputs=[gr.Audio(sources=["upload"], type="numpy")],
    outputs=["label"]
)

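# The Examples tab offers clickable sample clips; run_on_click=True re-runs
# file_fn on each click rather than serving cached outputs.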
with gr.Blocks() as example:
    inputs = [gr.Audio(sources=["upload"], type="numpy")]
    output = gr.Label()

    examples = [
        ["audio/cantina.wav"],
        ["audio/cat.mp3"]
    ]
    ex = gr.Examples(examples,
                     fn=file_fn, inputs=inputs, outputs=output,
                     run_on_click=True)

# TabbedInterface is itself a Blocks, so build it at the top level rather than
# nesting it inside another gr.Blocks context.
demo = gr.TabbedInterface([file_demo, streaming_demo, microphone_demo, example],
                          ["Audio file", "Streaming", "Microphone", "Example"])

if __name__ == "__main__":
    demo.launch(share=True)  # share=True also serves a temporary public link