import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import numpy as np
import pydub
from io import BytesIO
from models.nllb import nllb
from models.parakeet import parakeet_ctc_model
from stream_VAD import stream
from models.es_fastconformer import stt_es_model

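# Public Google STUN server lets browsers behind NAT negotiate the WebRTC connection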
RTC_CONFIGURATION = RTCConfiguration({"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]})

# Load models once
model_nllb, tokenizer_nllb = nllb()
parakeet = parakeet_ctc_model()
stt_model = stt_es_model()

def process_audio(audio_chunk: av.AudioFrame, language):
    # Convert the incoming av.AudioFrame to a pydub.AudioSegment
    audio_segment = pydub.AudioSegment(
        data=audio_chunk.to_ndarray().tobytes(),
        sample_width=audio_chunk.format.bytes,  # bytes per sample (2 for s16)
        frame_rate=audio_chunk.sample_rate,
        channels=len(audio_chunk.layout.channels)
    )

    # Run speech-to-text and translation for the selected language pair
    if language == "en":
        processed_audio = stream(parakeet, model_nllb, tokenizer_nllb, "english", "spanish", audio_segment)
    elif language == "es":
        processed_audio = stream(stt_model, model_nllb, tokenizer_nllb, "spanish", "english", audio_segment)
    else:
        # Unsupported language: pass the original audio through unchanged
        return audio_segment.frame_rate, np.array(audio_segment.get_array_of_samples())

    # Convert the processed pydub segment back to a numpy array
    processed_audio_np = np.array(processed_audio.get_array_of_samples())

    return processed_audio.frame_rate, processed_audio_np

def audio_callback(frame: av.AudioFrame, language):
    # The WebRTC frame already carries sample rate, format and channel layout,
    # so it can be handed to process_audio directly
    return process_audio(frame, language)

st.title("Real-Time Audio Processing")

language = st.radio("Select Language", ["en", "es"], index=0)

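# Audio-only WebRTC session; received frames are buffered in audio_receiver
# (up to audio_receiver_size frames) and pulled below on each script rerun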
webrtc_ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDRECV,
    rtc_configuration=RTC_CONFIGURATION,
    media_stream_constraints={"audio": True, "video": False},
    audio_receiver_size=256,
    async_processing=True,
)

if "audio_buffer" not in st.session_state:
    # Accumulate processed audio as a single pydub segment across reruns
    st.session_state["audio_buffer"] = pydub.AudioSegment.empty()

if webrtc_ctx.audio_receiver:
    audio_frames = webrtc_ctx.audio_receiver.get_frames()

    for frame in audio_frames:
        processed_audio_rate, processed_audio_np = audio_callback(frame, language)

        audio_segment = pydub.AudioSegment(
            data=processed_audio_np.tobytes(),
            sample_width=processed_audio_np.dtype.itemsize,
            frame_rate=processed_audio_rate,
            channels=1
        )
        # Append to the running buffer instead of concatenating whole WAV files,
        # so playback receives one well-formed WAV stream
        st.session_state["audio_buffer"] += audio_segment

    # Export the accumulated audio once and play it back
    wav_buffer = BytesIO()
    st.session_state["audio_buffer"].export(wav_buffer, format="wav")
    st.audio(wav_buffer.getvalue(), format="audio/wav")