Spaces:
Paused
Paused
Adding backend/ folder to HF space
Browse filesAdding all source files in backend folder to space
- backend/.DS_Store +0 -0
- backend/.env +1 -0
- backend/Client.py +74 -0
- backend/__pycache__/Client.cpython-310.pyc +0 -0
- backend/__pycache__/main.cpython-310.pyc +0 -0
- backend/backend.log +0 -0
- backend/main.py +338 -0
- backend/models/Seamless/vad_s2st_sc_24khz_main.yaml +25 -0
- backend/models/SeamlessStreaming/vad_s2st_sc_main.yaml +21 -0
- backend/mongodb/endpoints/__pycache__/calls.cpython-310.pyc +0 -0
- backend/mongodb/endpoints/__pycache__/users.cpython-310.pyc +0 -0
- backend/mongodb/endpoints/calls.py +53 -0
- backend/mongodb/endpoints/users.py +37 -0
- backend/mongodb/models/__pycache__/calls.cpython-310.pyc +0 -0
- backend/mongodb/models/__pycache__/users.cpython-310.pyc +0 -0
- backend/mongodb/models/calls.py +71 -0
- backend/mongodb/models/users.py +46 -0
- backend/mongodb/operations/__pycache__/calls.cpython-310.pyc +0 -0
- backend/mongodb/operations/__pycache__/users.cpython-310.pyc +0 -0
- backend/mongodb/operations/calls.py +128 -0
- backend/mongodb/operations/users.py +77 -0
- backend/pcmToWav.py +34 -0
- backend/preprocess_wav.py +65 -0
- backend/requirements.txt +36 -0
- backend/routes/__init__.py +1 -0
- backend/routes/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/routes/__pycache__/routing.cpython-310.pyc +0 -0
- backend/routes/routing.py +9 -0
- backend/seamless/__init__.py +0 -0
- backend/seamless/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/seamless/__pycache__/room.cpython-310.pyc +0 -0
- backend/seamless/__pycache__/simuleval_agent_directory.cpython-310.pyc +0 -0
- backend/seamless/__pycache__/simuleval_transcoder.cpython-310.pyc +0 -0
- backend/seamless/__pycache__/speech_and_text_output.cpython-310.pyc +0 -0
- backend/seamless/__pycache__/transcoder_helpers.cpython-310.pyc +0 -0
- backend/seamless/room.py +64 -0
- backend/seamless/simuleval_agent_directory.py +171 -0
- backend/seamless/simuleval_transcoder.py +428 -0
- backend/seamless/speech_and_text_output.py +15 -0
- backend/seamless/transcoder_helpers.py +43 -0
- backend/seamless_utils.py +210 -0
- backend/src/models/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/src/testing/test.py +4 -0
- backend/src/utility/__pycache__/utility.cpython-310.pyc +0 -0
backend/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
backend/.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
MONGODB_URI=mongodb+srv://benjolo:[email protected]/?retryWrites=true&w=majority&appName=IT-Cluster1
|
backend/Client.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Tuple
|
2 |
+
import wave
|
3 |
+
import os
|
4 |
+
|
5 |
+
import torchaudio
|
6 |
+
from vad import EnergyVAD
|
7 |
+
TARGET_SAMPLING_RATE = 16000
|
8 |
+
|
9 |
+
def create_frames(data: bytes, frame_duration: int) -> Tuple[bytes]:
|
10 |
+
frame_size = int(TARGET_SAMPLING_RATE * (frame_duration / 1000))
|
11 |
+
return (data[i:i + frame_size] for i in range(0, len(data), frame_size)), frame_size
|
12 |
+
|
13 |
+
def detect_activity(energies: list):
|
14 |
+
if sum(energies) < len(energies) / 12:
|
15 |
+
return False
|
16 |
+
count = 0
|
17 |
+
for energy in energies:
|
18 |
+
if energy == 1:
|
19 |
+
count += 1
|
20 |
+
if count == 12:
|
21 |
+
return True
|
22 |
+
else:
|
23 |
+
count = 0
|
24 |
+
return False
|
25 |
+
|
26 |
+
class Client:
|
27 |
+
def __init__(self, sid, client_id, call_id=None):
|
28 |
+
self.sid = sid
|
29 |
+
self.client_id = client_id
|
30 |
+
self.call_id = call_id
|
31 |
+
self.buffer = bytearray()
|
32 |
+
self.output_path = self.sid + "_output_audio.wav"
|
33 |
+
self.target_language = None
|
34 |
+
self.original_sr = None
|
35 |
+
self.vad = EnergyVAD(
|
36 |
+
sample_rate=TARGET_SAMPLING_RATE,
|
37 |
+
frame_length=25,
|
38 |
+
frame_shift=20,
|
39 |
+
energy_threshold=0.05,
|
40 |
+
pre_emphasis=0.95,
|
41 |
+
) # PM - Default values given in the docs for this class
|
42 |
+
|
43 |
+
def add_bytes(self, new_bytes):
|
44 |
+
self.buffer += new_bytes
|
45 |
+
|
46 |
+
def resample_and_write_to_file(self):
|
47 |
+
print("Audio being written to file....\n")
|
48 |
+
with wave.open(self.sid + "_OG.wav", "wb") as wf:
|
49 |
+
wf.setnchannels(1)
|
50 |
+
wf.setsampwidth(2)
|
51 |
+
wf.setframerate(self.original_sr)
|
52 |
+
wf.setnframes(0)
|
53 |
+
wf.setcomptype("NONE", "not compressed")
|
54 |
+
wf.writeframes(self.buffer)
|
55 |
+
waveform, sample_rate = torchaudio.load(self.sid + "_OG.wav")
|
56 |
+
resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLING_RATE, dtype=waveform.dtype)
|
57 |
+
resampled_waveform = resampler(waveform)
|
58 |
+
# torchaudio.save(self.output_path, resampled_waveform, TARGET_SAMPLING_RATE)
|
59 |
+
vad_waveform = self.vad(resampled_waveform)
|
60 |
+
# print(vad_waveform) # debugging
|
61 |
+
self.buffer = bytearray()
|
62 |
+
return detect_activity(vad_waveform), resampled_waveform
|
63 |
+
|
64 |
+
def get_length(self):
|
65 |
+
return len(self.buffer)
|
66 |
+
|
67 |
+
def __del__(self):
|
68 |
+
if len(self.buffer) > 0:
|
69 |
+
print(f"🚨 [ClientAudioBuffer] Buffer not empty for {self.sid} ({len(self.buffer)} bytes)!")
|
70 |
+
if os.path.exists(self.output_path):
|
71 |
+
os.remove(self.output_path)
|
72 |
+
if os.path.exists(self.sid + "_OG.wav"):
|
73 |
+
os.remove(self.sid + "_OG.wav")
|
74 |
+
|
backend/__pycache__/Client.cpython-310.pyc
ADDED
Binary file (2.99 kB). View file
|
|
backend/__pycache__/main.cpython-310.pyc
ADDED
Binary file (8.55 kB). View file
|
|
backend/backend.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
backend/main.py
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from operator import itemgetter
|
2 |
+
import os
|
3 |
+
from datetime import datetime
|
4 |
+
import uvicorn
|
5 |
+
from typing import Any, Optional, Tuple, Dict, TypedDict
|
6 |
+
from urllib import parse
|
7 |
+
from uuid import uuid4
|
8 |
+
import logging
|
9 |
+
###############################################
|
10 |
+
# Configure logger
|
11 |
+
###############################################
|
12 |
+
logging.basicConfig(filename="backend.log",
|
13 |
+
filemode='w',
|
14 |
+
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
|
15 |
+
datefmt='%H:%M:%S',
|
16 |
+
level=logging.DEBUG)
|
17 |
+
|
18 |
+
logger = logging.getLogger("socketio_server_pubsub")
|
19 |
+
logger.propagate = True
|
20 |
+
|
21 |
+
import sys
|
22 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/')
|
23 |
+
|
24 |
+
from fastapi import FastAPI
|
25 |
+
from fastapi.middleware.cors import CORSMiddleware
|
26 |
+
from pymongo import MongoClient
|
27 |
+
from dotenv import dotenv_values
|
28 |
+
from routes import router as api_router
|
29 |
+
from contextlib import asynccontextmanager
|
30 |
+
import requests
|
31 |
+
|
32 |
+
from typing import List
|
33 |
+
from datetime import date
|
34 |
+
from mongodb.operations.calls import *
|
35 |
+
from mongodb.models.calls import UserCall, UpdateCall
|
36 |
+
# from mongodb.endpoints.calls import *
|
37 |
+
|
38 |
+
from transformers import AutoProcessor, SeamlessM4Tv2Model
|
39 |
+
|
40 |
+
# from seamless_communication.inference import Translator
|
41 |
+
from Client import Client
|
42 |
+
#----------------------------------
|
43 |
+
# base seamless imports
|
44 |
+
# ---------------------------------
|
45 |
+
import numpy as np
|
46 |
+
import torch
|
47 |
+
# ---------------------------------
|
48 |
+
import socketio
|
49 |
+
|
50 |
+
DEBUG = True
|
51 |
+
|
52 |
+
ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME = "remove_server_lock"
|
53 |
+
|
54 |
+
TARGET_SAMPLING_RATE = 16000
|
55 |
+
MAX_BYTES_BUFFER = 480_000
|
56 |
+
|
57 |
+
print("")
|
58 |
+
print("")
|
59 |
+
print("=" * 20 + " ⭐️ Starting Server... ⭐️ " + "=" * 20)
|
60 |
+
|
61 |
+
###############################################
|
62 |
+
# Configure socketio server
|
63 |
+
###############################################
|
64 |
+
|
65 |
+
# TODO PM - change this to the actual path
|
66 |
+
# seamless remnant code
|
67 |
+
CLIENT_BUILD_PATH = "../streaming-react-app/dist/"
|
68 |
+
static_files = {
|
69 |
+
"/": CLIENT_BUILD_PATH,
|
70 |
+
"/assets/seamless-db6a2555.svg": {
|
71 |
+
"filename": CLIENT_BUILD_PATH + "assets/seamless-db6a2555.svg",
|
72 |
+
"content_type": "image/svg+xml",
|
73 |
+
},
|
74 |
+
}
|
75 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
76 |
+
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
|
77 |
+
# PM - hardcoding temporarily as my GPU doesnt have enough vram
|
78 |
+
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to("cpu")
|
79 |
+
|
80 |
+
config = dotenv_values(".env")
|
81 |
+
|
82 |
+
# Read connection string from environment vars
|
83 |
+
uri = os.environ['MONGODB_URI']
|
84 |
+
|
85 |
+
# Read connection string from .env file
|
86 |
+
# uri = config['MONGODB_URI']
|
87 |
+
|
88 |
+
# MongoDB Connection Lifespan Events
|
89 |
+
@asynccontextmanager
|
90 |
+
async def lifespan(app: FastAPI):
|
91 |
+
# startup logic
|
92 |
+
app.mongodb_client = MongoClient(uri)
|
93 |
+
app.database = app.mongodb_client['IT-Cluster1'] #connect to interpretalk primary db
|
94 |
+
try:
|
95 |
+
app.mongodb_client.admin.command('ping')
|
96 |
+
print("MongoDB Connection Established...")
|
97 |
+
except Exception as e:
|
98 |
+
print(e)
|
99 |
+
|
100 |
+
yield
|
101 |
+
|
102 |
+
# shutdown logic
|
103 |
+
print("Closing MongoDB Connection...")
|
104 |
+
app.mongodb_client.close()
|
105 |
+
|
106 |
+
app = FastAPI(lifespan=lifespan, logger=logger)
|
107 |
+
|
108 |
+
# New CORS funcitonality
|
109 |
+
app.add_middleware(
|
110 |
+
CORSMiddleware,
|
111 |
+
allow_origins=["*"], # configured node app port
|
112 |
+
allow_credentials=True,
|
113 |
+
allow_methods=["*"],
|
114 |
+
allow_headers=["*"],
|
115 |
+
)
|
116 |
+
|
117 |
+
app.include_router(api_router) # include routers for user, calls and transcripts operations
|
118 |
+
|
119 |
+
|
120 |
+
# sio is the main socket.io entrypoint
|
121 |
+
sio = socketio.AsyncServer(
|
122 |
+
async_mode="asgi",
|
123 |
+
cors_allowed_origins="*",
|
124 |
+
logger=logger,
|
125 |
+
engineio_logger=logger,
|
126 |
+
)
|
127 |
+
# sio.logger.setLevel(logging.DEBUG)
|
128 |
+
socketio_app = socketio.ASGIApp(sio)
|
129 |
+
# app.mount("/", socketio_app)
|
130 |
+
|
131 |
+
from fastapi import APIRouter, Body, Request, status
|
132 |
+
|
133 |
+
bytes_data = bytearray()
|
134 |
+
model_name = "seamlessM4T_v2_large"
|
135 |
+
vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"
|
136 |
+
|
137 |
+
clients = {}
|
138 |
+
rooms = {}
|
139 |
+
|
140 |
+
|
141 |
+
def get_collection_users():
|
142 |
+
return app.database["user_records"]
|
143 |
+
|
144 |
+
def get_collection_calls():
|
145 |
+
# return app.database["call_records"]
|
146 |
+
return app.database["call_test"]
|
147 |
+
|
148 |
+
|
149 |
+
@app.get("/test/", response_description="List all existing call records", response_model=List[UserCall])
|
150 |
+
def test():
|
151 |
+
|
152 |
+
result = list_calls(get_collection_calls(), 100)
|
153 |
+
|
154 |
+
# return {"message": "Welcome to InterpreTalk!"}
|
155 |
+
|
156 |
+
print(result)
|
157 |
+
return result
|
158 |
+
|
159 |
+
|
160 |
+
@app.put("/test_put/", response_description="List all existing call records", response_model=UserCall)
|
161 |
+
def test_put():
|
162 |
+
|
163 |
+
# result = list_calls(get_collection_calls(), 100)
|
164 |
+
# result = send_captions("TEST", "TEST", "TEST", "oUjUxTYTQFVVjEarIcZ0")
|
165 |
+
result = send_captions("TEST", "TEST", "TEST", "TESTID000001")
|
166 |
+
|
167 |
+
print(result)
|
168 |
+
return result
|
169 |
+
|
170 |
+
|
171 |
+
@app.post("/test_post/", response_description="List all existing call records", response_model=UserCall)
|
172 |
+
def test_post():
|
173 |
+
request_data = {
|
174 |
+
"call_id": "TESTID000001"
|
175 |
+
}
|
176 |
+
|
177 |
+
result = create_calls(get_collection_calls(), request_data)
|
178 |
+
|
179 |
+
# return {"message": "Welcome to InterpreTalk!"}
|
180 |
+
return result
|
181 |
+
|
182 |
+
|
183 |
+
async def send_translated_text(client_id, original_text, translated_text, room_id):
|
184 |
+
print('SEND_TRANSLATED_TEXT IS WOKRING IN FASTAPI BACKEND...')
|
185 |
+
print(rooms)
|
186 |
+
print(clients)
|
187 |
+
|
188 |
+
data = {
|
189 |
+
"author": str(client_id),
|
190 |
+
"original_text": str(original_text),
|
191 |
+
"translated_text": str(translated_text),
|
192 |
+
"timestamp": str(datetime.now())
|
193 |
+
}
|
194 |
+
logger.warning("SENDING TRANSLATED TEXT TO CLIENT")
|
195 |
+
await sio.emit("translated_text", data, room=room_id)
|
196 |
+
logger.warning("SUCCESSFULLY SEND AUDIO TO FRONTEND")
|
197 |
+
|
198 |
+
@sio.on("connect")
|
199 |
+
async def connect(sid, environ):
|
200 |
+
print(f"📥 [event: connected] sid={sid}")
|
201 |
+
query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
|
202 |
+
client_id = query_params.get("client_id")
|
203 |
+
logger.info(f"📥 [event: connected] sid={sid}, client_id={client_id}")
|
204 |
+
# sid = socketid, client_id = client specific ID ,always the same for same user
|
205 |
+
clients[sid] = Client(sid, client_id)
|
206 |
+
logger.warning(f"Client connected: {sid}")
|
207 |
+
logger.warning(clients)
|
208 |
+
|
209 |
+
@sio.on("disconnect")
|
210 |
+
async def disconnect(sid): # BO - also pass call id as parameter for updating MongoDB
|
211 |
+
logger.debug(f"📤 [event: disconnected] sid={sid}")
|
212 |
+
clients.pop(sid, None)
|
213 |
+
# BO -> Update Call record with call duration, key terms
|
214 |
+
|
215 |
+
@sio.on("target_language")
|
216 |
+
async def target_language(sid, target_lang):
|
217 |
+
logger.info(f"📥 [event: target_language] sid={sid}, target_lang={target_lang}")
|
218 |
+
clients[sid].target_language = target_lang
|
219 |
+
|
220 |
+
@sio.on("call_user")
|
221 |
+
async def call_user(sid, call_id):
|
222 |
+
clients[sid].call_id = call_id
|
223 |
+
logger.warning(f"CALL {sid}: entering room {call_id}")
|
224 |
+
rooms[call_id] = rooms.get(call_id, [])
|
225 |
+
if sid not in rooms[call_id] and len(rooms[call_id]) < 2:
|
226 |
+
rooms[call_id].append(sid)
|
227 |
+
sio.enter_room(sid, call_id)
|
228 |
+
else:
|
229 |
+
logger.warning(f"CALL {sid}: room {call_id} is full")
|
230 |
+
# await sio.emit("room_full", room=call_id, to=sid)
|
231 |
+
|
232 |
+
# # BO - Get call id from dictionary created during socketio connection
|
233 |
+
# client_id = clients[sid].client_id
|
234 |
+
|
235 |
+
# logger.warning(f"NOW TRYING TO CREATE DB RECORD FOR Caller with ID: {client_id} for call: {call_id}")
|
236 |
+
# # # BO -> Create Call Record with Caller and call_id field (None for callee, duration, terms..)
|
237 |
+
# request_data = {
|
238 |
+
# "call_id": str(call_id),
|
239 |
+
# "caller_id": str(client_id),
|
240 |
+
# "creation_date": str(datetime.now())
|
241 |
+
# }
|
242 |
+
|
243 |
+
# response = create_calls(get_collection_calls(), request_data)
|
244 |
+
# print(response) # BO - print created db call record
|
245 |
+
|
246 |
+
@sio.on("audio_config")
|
247 |
+
async def audio_config(sid, sample_rate):
|
248 |
+
clients[sid].original_sr = sample_rate
|
249 |
+
|
250 |
+
|
251 |
+
@sio.on("answer_call")
|
252 |
+
async def answer_call(sid, call_id):
|
253 |
+
|
254 |
+
clients[sid].call_id = call_id
|
255 |
+
logger.warning(f"ANSWER {sid}: entering room {call_id}")
|
256 |
+
rooms[call_id] = rooms.get(call_id, [])
|
257 |
+
if sid not in rooms[call_id] and len(rooms[call_id]) < 2:
|
258 |
+
rooms[call_id].append(sid)
|
259 |
+
sio.enter_room(sid, call_id)
|
260 |
+
else:
|
261 |
+
logger.warning(f"ANSWER {sid}: room {call_id} is full")
|
262 |
+
# await sio.emit("room_full", room=call_id, to=sid)
|
263 |
+
|
264 |
+
|
265 |
+
# # BO - Get call id from dictionary created during socketio connection
|
266 |
+
# client_id = clients[sid].client_id
|
267 |
+
|
268 |
+
# # BO -> Update Call Record with Callee field based on call_id
|
269 |
+
# logger.warning(f"NOW UPDATING MongoDB RECORD FOR Caller with ID: {client_id} for call: {call_id}")
|
270 |
+
# # # BO -> Create Call Record with callee_id field (None for callee, duration, terms..)
|
271 |
+
# request_data = {
|
272 |
+
# "callee_id": client_id
|
273 |
+
# }
|
274 |
+
|
275 |
+
# response = update_calls(get_collection_calls(), call_id, request_data)
|
276 |
+
# print(response) # BO - print created db call record
|
277 |
+
|
278 |
+
|
279 |
+
@sio.on("incoming_audio")
|
280 |
+
async def incoming_audio(sid, data, call_id):
|
281 |
+
try:
|
282 |
+
clients[sid].add_bytes(data)
|
283 |
+
|
284 |
+
if clients[sid].get_length() >= MAX_BYTES_BUFFER:
|
285 |
+
logger.warning('Buffer full, now outputting...')
|
286 |
+
output_path = clients[sid].output_path
|
287 |
+
vad_result, resampled_audio = clients[sid].resample_and_write_to_file()
|
288 |
+
# source lang is speakers tgt language 😃
|
289 |
+
src_lang = clients[sid].target_language
|
290 |
+
if vad_result:
|
291 |
+
logger.warning('Speech detected, now processing audio.....')
|
292 |
+
tgt_sid = next(id for id in rooms[call_id] if id != sid)
|
293 |
+
tgt_lang = clients[tgt_sid].target_language
|
294 |
+
# following example from https://github.com/facebookresearch/seamless_communication/blob/main/docs/m4t/README.md#transformers-usage
|
295 |
+
output_tokens = processor(audios=resampled_audio, src_lang=src_lang, return_tensors="pt")
|
296 |
+
model_output = model.generate(**output_tokens, tgt_lang=src_lang, generate_speech=False)[0].tolist()[0]
|
297 |
+
asr_text = processor.decode(model_output, skip_special_tokens=True)
|
298 |
+
print(f"ASR TEXT = {asr_text}")
|
299 |
+
# ASR TEXT => ORIGINAL TEXT
|
300 |
+
|
301 |
+
t2t_tokens = processor(text=asr_text, src_lang=src_lang, tgt_lang=tgt_lang, return_tensors="pt")
|
302 |
+
print(f"FIRST TYPE = {type(output_tokens)}, SECOND TYPE = {type(t2t_tokens)}")
|
303 |
+
translated_data = model.generate(**t2t_tokens, tgt_lang=tgt_lang, generate_speech=False)[0].tolist()[0]
|
304 |
+
translated_text = processor.decode(translated_data, skip_special_tokens=True)
|
305 |
+
print(f"TRANSLATED TEXT = {translated_text}")
|
306 |
+
|
307 |
+
# BO -> send translated_text to mongodb as caption record update based on call_id
|
308 |
+
# send_captions(clients[sid].client_id, asr_text, translated_text, call_id)
|
309 |
+
|
310 |
+
# TRANSLATED TEXT
|
311 |
+
# PM - text_output is a list with 1 string
|
312 |
+
await send_translated_text(clients[sid].client_id, asr_text, translated_text, call_id)
|
313 |
+
|
314 |
+
# # BO -> send translated_text to mongodb as caption record update based on call_id
|
315 |
+
# send_captions(clients[sid].client_id, asr_text, translated_text, call_id)
|
316 |
+
|
317 |
+
except Exception as e:
|
318 |
+
logger.error(f"Error in incoming_audio: {e.with_traceback()}")
|
319 |
+
|
320 |
+
def send_captions(client_id, original_text, translated_text, call_id):
|
321 |
+
# BO -> Update Call Record with Callee field based on call_id
|
322 |
+
print(f"Now updating Caption field in call record for Caller with ID: {client_id} for call: {call_id}")
|
323 |
+
|
324 |
+
data = {
|
325 |
+
"author": str(client_id),
|
326 |
+
"original_text": str(original_text),
|
327 |
+
"translated_text": str(translated_text),
|
328 |
+
"timestamp": str(datetime.now())
|
329 |
+
}
|
330 |
+
|
331 |
+
response = update_captions(get_collection_calls(), call_id, data)
|
332 |
+
return response
|
333 |
+
|
334 |
+
app.mount("/", socketio_app)
|
335 |
+
|
336 |
+
if __name__ == '__main__':
|
337 |
+
uvicorn.run("main:app", host='127.0.0.1', port=8080, log_level="info")
|
338 |
+
|
backend/models/Seamless/vad_s2st_sc_24khz_main.yaml
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
|
2 |
+
monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
|
3 |
+
unity_model_name: seamless_streaming_unity
|
4 |
+
sentencepiece_model: spm_256k_nllb100.model
|
5 |
+
|
6 |
+
task: s2st
|
7 |
+
tgt_lang: "eng"
|
8 |
+
min_unit_chunk_size: 50
|
9 |
+
decision_threshold: 0.7
|
10 |
+
no_early_stop: True
|
11 |
+
block_ngrams: True
|
12 |
+
vocoder_name: vocoder_v2
|
13 |
+
expr_vocoder_name: vocoder_pretssel
|
14 |
+
gated_model_dir: .
|
15 |
+
expr_vocoder_gain: 3.0
|
16 |
+
upstream_idx: 1
|
17 |
+
wav2vec_yaml: wav2vec.yaml
|
18 |
+
min_starting_wait_w2vbert: 192
|
19 |
+
|
20 |
+
config_yaml: cfg_fbank_u2t.yaml
|
21 |
+
upstream_idx: 1
|
22 |
+
detokenize_only: True
|
23 |
+
device: cuda:0
|
24 |
+
max_len_a: 0
|
25 |
+
max_len_b: 1000
|
backend/models/SeamlessStreaming/vad_s2st_sc_main.yaml
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
agent_class: seamless_communication.streaming.agents.seamless_streaming_s2st.SeamlessStreamingS2STJointVADAgent
|
2 |
+
monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
|
3 |
+
unity_model_name: seamless_streaming_unity
|
4 |
+
sentencepiece_model: spm_256k_nllb100.model
|
5 |
+
|
6 |
+
task: s2st
|
7 |
+
tgt_lang: "eng"
|
8 |
+
min_unit_chunk_size: 50
|
9 |
+
decision_threshold: 0.7
|
10 |
+
no_early_stop: True
|
11 |
+
block_ngrams: True
|
12 |
+
vocoder_name: vocoder_v2
|
13 |
+
wav2vec_yaml: wav2vec.yaml
|
14 |
+
min_starting_wait_w2vbert: 192
|
15 |
+
|
16 |
+
config_yaml: cfg_fbank_u2t.yaml
|
17 |
+
upstream_idx: 1
|
18 |
+
detokenize_only: True
|
19 |
+
device: cuda:0
|
20 |
+
max_len_a: 0
|
21 |
+
max_len_b: 1000
|
backend/mongodb/endpoints/__pycache__/calls.cpython-310.pyc
ADDED
Binary file (2.9 kB). View file
|
|
backend/mongodb/endpoints/__pycache__/users.cpython-310.pyc
ADDED
Binary file (1.94 kB). View file
|
|
backend/mongodb/endpoints/calls.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, Body, Request, status
|
2 |
+
from typing import List
|
3 |
+
from datetime import date
|
4 |
+
|
5 |
+
import sys
|
6 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
7 |
+
|
8 |
+
from ..operations import calls as calls
|
9 |
+
from ..models.calls import UserCall, UpdateCall
|
10 |
+
|
11 |
+
router = APIRouter(prefix="/call",
|
12 |
+
tags=["Calls"])
|
13 |
+
|
14 |
+
@router.post("/create-call", response_description="Create a new user call record", status_code=status.HTTP_201_CREATED, response_model=UserCall)
|
15 |
+
async def create_calls(request: Request, user_calls: UserCall = Body(...)):
|
16 |
+
return calls.create_calls(request, user_calls)
|
17 |
+
|
18 |
+
@router.get("/list-call", response_description="List all existing call records", response_model=List[UserCall])
|
19 |
+
async def list_calls(request: Request, limit: int):
|
20 |
+
return calls.list_calls(request, 100)
|
21 |
+
|
22 |
+
@router.get("/find-call/{call_id}", response_description="Find user's calls based on User ID", response_model=UserCall)
|
23 |
+
async def find_calls(request: Request, call_id: str):
|
24 |
+
return calls.find_calls(request, call_id)
|
25 |
+
|
26 |
+
@router.get("/find-user/{user_id}", response_description="Find user's calls based on User ID", response_model=List[UserCall])
|
27 |
+
async def find_user_calls(request: Request, user_id: str):
|
28 |
+
return calls.find_user_calls(request, user_id)
|
29 |
+
|
30 |
+
'''Key terms list can have variable length -> using POST request over GET'''
|
31 |
+
@router.post("/find-term/", response_description="Find calls based on key term list", response_model=List[UserCall])
|
32 |
+
async def list_transcripts_by_key_terms(request: Request, key_terms: List[str]):
|
33 |
+
return calls.list_transcripts_by_key_terms(request, key_terms)
|
34 |
+
|
35 |
+
@router.get("/find-date/{start_date}/{end_date}", response_description="Find calls based on date ranges", response_model=List[UserCall])
|
36 |
+
async def list_transcripts_by_dates(request: Request, start_date: str, end_date: str):
|
37 |
+
return calls.list_transcripts_by_dates(request, start_date, end_date)
|
38 |
+
|
39 |
+
@router.get("/find-duration/{min_len}/{max_len}", response_description="Find calls based on call duration in minutes", response_model=List[UserCall])
|
40 |
+
async def list_transcripts_by_duration(request: Request, min_len: int, max_len: int):
|
41 |
+
return calls.list_transcripts_by_duration(request, min_len, max_len)
|
42 |
+
|
43 |
+
@router.put("/update-call/{call_id}", response_description="Update an existing call", response_model=UpdateCall)
|
44 |
+
async def update_calls(request: Request, call_id: str, user_calls: UpdateCall = Body(...)):
|
45 |
+
return calls.update_calls(request, call_id, user_calls)
|
46 |
+
|
47 |
+
@router.put("/update-captions/{call_id}", response_description="Update an existing call", response_model=UpdateCall)
|
48 |
+
async def update_captions(request: Request, call_id: str, user_calls: UpdateCall = Body(...)):
|
49 |
+
return calls.update_captions(request, call_id, user_calls)
|
50 |
+
|
51 |
+
@router.delete("/delete-call/{call_id}", response_description="Delete a call by its id")
|
52 |
+
async def delete_call(request: Request, call_id :str):
|
53 |
+
return calls.delete_calls(request, call_id)
|
backend/mongodb/endpoints/users.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, Body, Request, status
|
2 |
+
from typing import List
|
3 |
+
import sys
|
4 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
5 |
+
from ..models.users import User, UpdateUser
|
6 |
+
from ..operations import users as users
|
7 |
+
|
8 |
+
router = APIRouter(prefix="/user",
|
9 |
+
tags=["User"])
|
10 |
+
|
11 |
+
@router.post("/", response_description="Create a new user", status_code=status.HTTP_201_CREATED, response_model=User)
|
12 |
+
async def create_user(request: Request, user: User = Body(...)):
|
13 |
+
return users.create_user(request,user)
|
14 |
+
|
15 |
+
@router.get("/", response_description="List users", response_model=List[User])
|
16 |
+
async def list_users(request: Request):
|
17 |
+
return users.list_users(request, 100)
|
18 |
+
|
19 |
+
@router.put("/{user_id}", response_description="Update a User", response_model=UpdateUser)
|
20 |
+
async def update_user(request: Request, user_id: str, user: UpdateUser = Body(...)):
|
21 |
+
return users.update_user(request, user_id, user)
|
22 |
+
|
23 |
+
@router.get("/{user_id}", response_description="Get a single user by id", response_model=User)
|
24 |
+
async def find_user(request: Request, user_id: str):
|
25 |
+
return users.find_user(request, user_id)
|
26 |
+
|
27 |
+
@router.get("/name/{user_name}", response_description="Get a single user by name", response_model=User)
|
28 |
+
async def find_user_name(request: Request, name: str):
|
29 |
+
return users.find_user_name(request, name)
|
30 |
+
|
31 |
+
@router.get("/email/{email_addr}", response_description="Get a single user by email", response_model=User)
|
32 |
+
async def find_user_email(request: Request, email: str):
|
33 |
+
return users.find_user_email(request, email)
|
34 |
+
|
35 |
+
@router.delete("/{user_id}", response_description="Delete a user")
|
36 |
+
async def delete_user(request: Request, user_id:str):
|
37 |
+
return users.delete_user(request, user_id)
|
backend/mongodb/models/__pycache__/calls.cpython-310.pyc
ADDED
Binary file (2.51 kB). View file
|
|
backend/mongodb/models/__pycache__/users.cpython-310.pyc
ADDED
Binary file (1.83 kB). View file
|
|
backend/mongodb/models/calls.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import uuid
|
2 |
+
from typing import List, Dict, Optional
|
3 |
+
from datetime import datetime
|
4 |
+
from pydantic import BaseModel, Field, PrivateAttr
|
5 |
+
|
6 |
+
import sys
|
7 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
8 |
+
|
9 |
+
# from pydantic_mongo import ObjectIdField
|
10 |
+
|
11 |
+
|
12 |
+
''' Class for storing captions generated by SeamlessM4T'''
|
13 |
+
class UserCaptions(BaseModel):
|
14 |
+
_id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4) # private attr not included in http calls
|
15 |
+
author: str
|
16 |
+
original_text: str
|
17 |
+
translated_text: str
|
18 |
+
timestamp: datetime = Field(default_factory=datetime.now)
|
19 |
+
|
20 |
+
|
21 |
+
'''Class for storing past call records from users'''
|
22 |
+
class UserCall(BaseModel):
|
23 |
+
_id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4)
|
24 |
+
call_id: Optional[str] = None
|
25 |
+
caller_id: Optional[str] = None
|
26 |
+
callee_id: Optional[str] = None
|
27 |
+
creation_date: datetime = Field(default_factory=datetime.now, alias="date")
|
28 |
+
duration: Optional[int] = None # milliseconds
|
29 |
+
captions: Optional[List[UserCaptions]] = None
|
30 |
+
# captions: List[Dict[str, str | float]]
|
31 |
+
key_terms: Optional[List[str]] = None
|
32 |
+
|
33 |
+
'''Implement validation check on transcript ID if in transcript records'''
|
34 |
+
# @validator('transcript_id')
|
35 |
+
# def check_transcript_id_transcripts_collection(cls,transcript_id):
|
36 |
+
# transcript_ids=['',''] # check transcript_ids in collection
|
37 |
+
# if transcript_id not in transcript_ids:
|
38 |
+
# raise ValueError(f'transcript ID must be in {transcript_ids}')
|
39 |
+
# return transcript_id
|
40 |
+
|
41 |
+
class Config:
|
42 |
+
populate_by_name = True
|
43 |
+
json_schema_extra = {
|
44 |
+
"example": {
|
45 |
+
"call_id": "65eef930e9abd3b1e3506906",
|
46 |
+
"caller_id": "65ede65b6d246e52aaba9d4f",
|
47 |
+
"callee_id": "65edda944340ac84c1f00758",
|
48 |
+
"duration": 360,
|
49 |
+
"captions": [{"author": "shamzino", "original_text": "eng: This is original_text english text", "translated_text": "spa: este es el texto traducido al español", "timestamp": "2024-03-28T16:15:50.956055"},
|
50 |
+
{"author": "benjino", "original_text": "eng: This is source english text", "translated_text": "spa: este es el texto fuente al español", "timestamp": "2024-03-28T16:16:20.34625"}],
|
51 |
+
"key_terms": ["original_text", "source", "english", "text"]
|
52 |
+
}
|
53 |
+
}
|
54 |
+
|
55 |
+
|
56 |
+
''' Class for updating User Call record'''
|
57 |
+
class UpdateCall(BaseModel):
|
58 |
+
call_id: Optional[str] = None
|
59 |
+
caller_id: Optional[str] = None
|
60 |
+
callee_id: Optional[str] = None
|
61 |
+
duration: Optional[int] = None
|
62 |
+
captions: Optional[List[UserCaptions]] = None
|
63 |
+
key_terms: Optional[List[str]] = None
|
64 |
+
|
65 |
+
class Config:
|
66 |
+
populate_by_name = True
|
67 |
+
json_schema_extra = {
|
68 |
+
"example": {
|
69 |
+
"duration": "500"
|
70 |
+
}
|
71 |
+
}
|
backend/mongodb/models/users.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import uuid
|
2 |
+
from typing import List, Optional
|
3 |
+
from pydantic import BaseModel, Field, SecretStr, PrivateAttr
|
4 |
+
from pydantic.networks import EmailStr
|
5 |
+
|
6 |
+
from pydantic_mongo import ObjectIdField
|
7 |
+
|
8 |
+
'''Class for user model used to relate users to past calls'''
|
9 |
+
class User(BaseModel):
|
10 |
+
_id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4) # private attr not included in http calls
|
11 |
+
user_id: str
|
12 |
+
name: str
|
13 |
+
email: EmailStr = Field(unique=True, index=True)
|
14 |
+
password: SecretStr
|
15 |
+
call_ids: Optional[List[str]] = None
|
16 |
+
|
17 |
+
class Config:
|
18 |
+
populate_by_name = True
|
19 |
+
json_schema_extra = {
|
20 |
+
"example": {
|
21 |
+
"user_id": "65ede65b6d246e52aaba9d4f",
|
22 |
+
"name": "benjolo",
|
23 |
+
"email": "[email protected]",
|
24 |
+
"password": "therealbenjolo",
|
25 |
+
"call_ids": ["65e205ced1be3a22854ff300", "65df8c3eba9c7c2ed1b20e85"]
|
26 |
+
}
|
27 |
+
}
|
28 |
+
|
29 |
+
'''Class for updating user records'''
|
30 |
+
class UpdateUser(BaseModel):
|
31 |
+
user_id: Optional[str] = None
|
32 |
+
name: Optional[str] = None
|
33 |
+
email: Optional[EmailStr] = None
|
34 |
+
''' To decode use -> SecretStr("abc").get_secret_value()'''
|
35 |
+
# password: Optional[SecretStr]
|
36 |
+
call_ids: Optional[List[str]] = None
|
37 |
+
|
38 |
+
class Config:
|
39 |
+
populate_by_name = True
|
40 |
+
json_schema_extra = {
|
41 |
+
"example": {
|
42 |
+
"email": "[email protected]",
|
43 |
+
"call_ids": ["65e205ced1be3a22854ff300", "65df8c3eba9c7c2ed1b20e85", "65eef930e9abd3b1e3506906"]
|
44 |
+
}
|
45 |
+
}
|
46 |
+
|
backend/mongodb/operations/__pycache__/calls.cpython-310.pyc
ADDED
Binary file (4.26 kB). View file
|
|
backend/mongodb/operations/__pycache__/users.cpython-310.pyc
ADDED
Binary file (2.98 kB). View file
|
|
backend/mongodb/operations/calls.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import Body, Request, HTTPException, status
|
2 |
+
from fastapi.encoders import jsonable_encoder
|
3 |
+
import sys
|
4 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
5 |
+
from ..models.calls import UpdateCall, UserCall, UserCaptions
|
6 |
+
|
7 |
+
def get_collection_calls(request: Request):
|
8 |
+
try:
|
9 |
+
# return request.app.database["call_records"]
|
10 |
+
return request.app.database["call_test"]
|
11 |
+
except:
|
12 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Unable to find call records Database.")
|
13 |
+
|
14 |
+
|
15 |
+
def create_calls(collection, user: UserCall = Body(...)):
|
16 |
+
calls = jsonable_encoder(user)
|
17 |
+
# new_calls = get_collection_calls(request).insert_one(calls)
|
18 |
+
new_calls = collection.insert_one(calls)
|
19 |
+
# created_calls = get_collection_calls(request).find_one({"_id": new_calls.inserted_id})
|
20 |
+
created_calls = collection.find_one({"_id": new_calls.inserted_id})
|
21 |
+
|
22 |
+
return created_calls
|
23 |
+
|
24 |
+
|
25 |
+
def list_calls(request: Request, limit: int):
|
26 |
+
try:
|
27 |
+
calls = list(get_collection_calls(request).find(limit = limit))
|
28 |
+
# dateTest = calls[2]['date']
|
29 |
+
return calls
|
30 |
+
except:
|
31 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No existing call records yet.")
|
32 |
+
|
33 |
+
|
34 |
+
'''Finding calls based on call id'''
|
35 |
+
def find_calls(request: Request, call_id: str):
|
36 |
+
# if user_calls := get_collection_calls(request).find_one({"call_id": call_id}) is not None:
|
37 |
+
user_calls = get_collection_calls(request).find_one({"call_id": call_id})
|
38 |
+
if user_calls is not None:
|
39 |
+
return user_calls
|
40 |
+
else:
|
41 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with ID: '{call_id}' not found.")
|
42 |
+
|
43 |
+
|
44 |
+
'''Finding calls based on user id'''
|
45 |
+
def find_user_calls(request: Request, user_id: str):
|
46 |
+
user_calls = list(get_collection_calls(request).find({"$or": [{"caller_id": user_id}, {"callee_id": user_id}]})) # match on caller or callee ID
|
47 |
+
if len(user_calls):
|
48 |
+
return user_calls
|
49 |
+
else:
|
50 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with ID: '{user_id}' has no calls yet.")
|
51 |
+
|
52 |
+
|
53 |
+
'''Finding calls based on key terms list'''
|
54 |
+
def list_transcripts_by_key_terms(request: Request, key_terms_list: list[str] = Body(...)):
|
55 |
+
key_terms_list = jsonable_encoder(key_terms_list)
|
56 |
+
|
57 |
+
call_records = list(get_collection_calls(request).find({"key_terms": {"$in": key_terms_list}}, {'_id': 0})) # exclude returning ObjectID in find()
|
58 |
+
|
59 |
+
# Check if any call records were returned
|
60 |
+
if len(call_records):
|
61 |
+
return call_records
|
62 |
+
else:
|
63 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with key terms: '{key_terms_list}' not found!")
|
64 |
+
|
65 |
+
|
66 |
+
'''Finding calls based on date ranges'''
|
67 |
+
def list_transcripts_by_dates(request: Request, start_date: str, end_date: str):
|
68 |
+
print(start_date, end_date)
|
69 |
+
# Convert strings to date string in YYYY-MM-ddT00:00:00 format
|
70 |
+
start_date = f'{start_date}T00:00:00'
|
71 |
+
end_date = f'{end_date}T00:00:00'
|
72 |
+
|
73 |
+
call_records = list(get_collection_calls(request).find({"date":{"$gte": start_date, "$lte": end_date}}))
|
74 |
+
|
75 |
+
if len(call_records):
|
76 |
+
return call_records
|
77 |
+
else:
|
78 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with creation date between: '{start_date} - {end_date}' not found!")
|
79 |
+
|
80 |
+
|
81 |
+
'''Finding calls based on call lengths'''
|
82 |
+
def list_transcripts_by_duration(request: Request, min_len: int, max_len: int):
|
83 |
+
|
84 |
+
call_records = list(get_collection_calls(request).find({"duration":{"$gte": min_len, "$lte": max_len}}))
|
85 |
+
|
86 |
+
if len(call_records):
|
87 |
+
return call_records
|
88 |
+
else:
|
89 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with duration between: '{min_len} - {max_len}' milliseconds not found!")
|
90 |
+
|
91 |
+
|
92 |
+
def update_calls(collection, call_id: str, calls: UpdateCall = Body(...)):
|
93 |
+
# calls = {k: v for k, v in calls.model_dump().items() if v is not None} #loop in the dict
|
94 |
+
calls = {k: v for k, v in calls.items() if v is not None} #loop in the dict
|
95 |
+
if len(calls) >= 1:
|
96 |
+
update_result = collection.update_one({"call_id": call_id}, {"$set": calls})
|
97 |
+
|
98 |
+
if update_result.modified_count == 0:
|
99 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call not updated!")
|
100 |
+
|
101 |
+
if (existing_item := collection.find_one({"call_id": call_id})) is not None:
|
102 |
+
return existing_item
|
103 |
+
|
104 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call not found!")
|
105 |
+
|
106 |
+
def update_captions(collection, call_id: str, calls: UserCaptions = Body(...)):
|
107 |
+
# calls = {k: v for k, v in calls.model_dump().items() if v is not None}
|
108 |
+
calls = {k: v for k, v in calls.items() if v is not None}
|
109 |
+
if len(calls) >= 1:
|
110 |
+
update_result = collection.update_one({"call_id": call_id},
|
111 |
+
{"$push": {"captions": calls}})
|
112 |
+
|
113 |
+
if update_result.modified_count == 0:
|
114 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Captions not updated!")
|
115 |
+
|
116 |
+
if (existing_item := collection.find_one({"call_id": call_id})) is not None:
|
117 |
+
return existing_item
|
118 |
+
|
119 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Captions not found!")
|
120 |
+
|
121 |
+
|
122 |
+
def delete_calls(request: Request, call_id: str):
|
123 |
+
deleted_calls = get_collection_calls(request).delete_one({"call_id": call_id})
|
124 |
+
|
125 |
+
if deleted_calls.deleted_count == 1:
|
126 |
+
return f"Call deleted sucessfully!"
|
127 |
+
|
128 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call not found!")
|
backend/mongodb/operations/users.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import Body, Request, HTTPException, status
|
2 |
+
from fastapi.encoders import jsonable_encoder
|
3 |
+
import sys
|
4 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
5 |
+
from ..models.users import User, UpdateUser
|
6 |
+
from bson import ObjectId
|
7 |
+
import re
|
8 |
+
|
9 |
+
|
10 |
+
def get_collection_users(request: Request):
|
11 |
+
test = request.app.database["user_records"]
|
12 |
+
print(type(test))
|
13 |
+
return test
|
14 |
+
|
15 |
+
|
16 |
+
def create_user(request: Request, user: User = Body(...)):
|
17 |
+
user = jsonable_encoder(user)
|
18 |
+
new_user = get_collection_users(request).insert_one(user)
|
19 |
+
created_user = get_collection_users(request).find_one({"_id": new_user.inserted_id})
|
20 |
+
print("NEW ID IS:.........", new_user.inserted_id)
|
21 |
+
return created_user
|
22 |
+
|
23 |
+
|
24 |
+
def list_users(request: Request, limit: int):
|
25 |
+
try:
|
26 |
+
users = list(get_collection_users(request).find(limit = limit))
|
27 |
+
return users
|
28 |
+
except:
|
29 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No users found!")
|
30 |
+
|
31 |
+
|
32 |
+
def find_user(request: Request, user_id: str):
|
33 |
+
if (user := get_collection_users(request).find_one({"user_id": user_id})):
|
34 |
+
return user
|
35 |
+
else:
|
36 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
|
37 |
+
|
38 |
+
|
39 |
+
def find_user_name(request: Request, name: str):
|
40 |
+
# search for name in lowercase
|
41 |
+
if (user := get_collection_users(request).find_one({"name": re.compile('^' + re.escape(name) + '$', re.IGNORECASE)})):
|
42 |
+
return user
|
43 |
+
else:
|
44 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with name {name} not found!")
|
45 |
+
|
46 |
+
|
47 |
+
def find_user_email(request: Request, email: str):
|
48 |
+
if (user := get_collection_users(request).find_one({"email": re.compile('^' + re.escape(email) + '$', re.IGNORECASE)})):
|
49 |
+
return user
|
50 |
+
else:
|
51 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with Email Address {email} not found!")
|
52 |
+
|
53 |
+
|
54 |
+
''' Update user record based on user object/json'''
|
55 |
+
def update_user(request: Request, user_id: str, user: UpdateUser):
|
56 |
+
try:
|
57 |
+
user = {k: v for k, v in user.model_dump().items() if v is not None}
|
58 |
+
if len(user) >= 1:
|
59 |
+
update_result = get_collection_users(request).update_one({"user_id": user_id}, {"$set": user})
|
60 |
+
|
61 |
+
if update_result.modified_count == 0:
|
62 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id: '{user_id}' not found and updated!")
|
63 |
+
|
64 |
+
if (existing_users := get_collection_users(request).find_one({"user_id": user_id})) is not None:
|
65 |
+
return existing_users
|
66 |
+
except:
|
67 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id: '{user_id}' not found and updated!")
|
68 |
+
|
69 |
+
|
70 |
+
def delete_user(request: Request, user_id: str):
|
71 |
+
try:
|
72 |
+
deleted_user = get_collection_users(request).delete_one({"user_id": user_id})
|
73 |
+
|
74 |
+
if deleted_user.deleted_count == 1:
|
75 |
+
return f"User with user_id {user_id} deleted sucessfully"
|
76 |
+
except:
|
77 |
+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
|
backend/pcmToWav.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import wave
|
2 |
+
import os
|
3 |
+
|
4 |
+
|
5 |
+
basePath = os.path.expanduser("~/Desktop/")
|
6 |
+
|
7 |
+
|
8 |
+
def convert_pcm_to_wav():
|
9 |
+
# PCM file parameters (should match the parameters used to create the PCM file)
|
10 |
+
pcm_file = basePath + 'output.pcm'
|
11 |
+
wav_file = 'pcmconverted.wav'
|
12 |
+
sample_rate = 16000 # Example: 16000 Hz
|
13 |
+
channels = 1 # Example: 2 for stereo
|
14 |
+
sample_width = 2 # Example: 2 bytes (16 bits), change if your PCM format is different
|
15 |
+
|
16 |
+
# Read the PCM file and write to a WAV file
|
17 |
+
with open(pcm_file, 'rb') as pcmfile:
|
18 |
+
pcm_data = pcmfile.read()
|
19 |
+
|
20 |
+
with wave.open(wav_file, 'wb') as wavfile:
|
21 |
+
wavfile.setnchannels(channels)
|
22 |
+
wavfile.setsampwidth(sample_width)
|
23 |
+
wavfile.setframerate(sample_rate)
|
24 |
+
wavfile.writeframes(pcm_data)
|
25 |
+
|
26 |
+
convert_pcm_to_wav()
|
27 |
+
|
28 |
+
# def generateCaptions(filepath):
|
29 |
+
|
30 |
+
# ! This might be redundant due to seamless-streaming
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
print(f"Converted {pcm_file} to {wav_file}")
|
backend/preprocess_wav.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import soundfile
|
2 |
+
import io
|
3 |
+
from typing import Any, Tuple, Union, Optional
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
|
7 |
+
def preprocess_wav(data: Any, incoming_sample_rate) -> Tuple[np.ndarray, int]:
|
8 |
+
segment, sample_rate = soundfile.read(
|
9 |
+
io.BytesIO(data),
|
10 |
+
dtype="float32",
|
11 |
+
always_2d=True,
|
12 |
+
frames=-1,
|
13 |
+
start=0,
|
14 |
+
format="RAW",
|
15 |
+
subtype="PCM_16",
|
16 |
+
samplerate=incoming_sample_rate,
|
17 |
+
channels=1,
|
18 |
+
)
|
19 |
+
return segment, sample_rate
|
20 |
+
|
21 |
+
def convert_waveform(
|
22 |
+
waveform: Union[np.ndarray, torch.Tensor],
|
23 |
+
sample_rate: int,
|
24 |
+
normalize_volume: bool = False,
|
25 |
+
to_mono: bool = False,
|
26 |
+
to_sample_rate: Optional[int] = None,
|
27 |
+
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
|
28 |
+
"""convert a waveform:
|
29 |
+
- to a target sample rate
|
30 |
+
- from multi-channel to mono channel
|
31 |
+
- volume normalization
|
32 |
+
|
33 |
+
Args:
|
34 |
+
waveform (numpy.ndarray or torch.Tensor): 2D original waveform
|
35 |
+
(channels x length)
|
36 |
+
sample_rate (int): original sample rate
|
37 |
+
normalize_volume (bool): perform volume normalization
|
38 |
+
to_mono (bool): convert to mono channel if having multiple channels
|
39 |
+
to_sample_rate (Optional[int]): target sample rate
|
40 |
+
Returns:
|
41 |
+
waveform (numpy.ndarray): converted 2D waveform (channels x length)
|
42 |
+
sample_rate (float): target sample rate
|
43 |
+
"""
|
44 |
+
try:
|
45 |
+
import torchaudio.sox_effects as ta_sox
|
46 |
+
except ImportError:
|
47 |
+
raise ImportError("Please install torchaudio: pip install torchaudio")
|
48 |
+
|
49 |
+
effects = []
|
50 |
+
if normalize_volume:
|
51 |
+
effects.append(["gain", "-n"])
|
52 |
+
if to_sample_rate is not None and to_sample_rate != sample_rate:
|
53 |
+
effects.append(["rate", f"{to_sample_rate}"])
|
54 |
+
if to_mono and waveform.shape[0] > 1:
|
55 |
+
effects.append(["channels", "1"])
|
56 |
+
if len(effects) > 0:
|
57 |
+
is_np_input = isinstance(waveform, np.ndarray)
|
58 |
+
_waveform = torch.from_numpy(waveform) if is_np_input else waveform
|
59 |
+
converted, converted_sample_rate = ta_sox.apply_effects_tensor(
|
60 |
+
_waveform, sample_rate, effects
|
61 |
+
)
|
62 |
+
if is_np_input:
|
63 |
+
converted = converted.numpy()
|
64 |
+
return converted, converted_sample_rate
|
65 |
+
return waveform, sample_rate
|
backend/requirements.txt
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
# seamless_communication
|
4 |
+
git+https://github.com/facebookresearch/seamless_communication.git
|
5 |
+
# ./whl/seamless_communication-1.0.0-py3-none-any.whl
|
6 |
+
Flask==2.1.3
|
7 |
+
Flask_Sockets==0.2.1
|
8 |
+
g2p_en==2.1.0
|
9 |
+
gevent==22.10.2
|
10 |
+
gevent_websocket==0.10.1
|
11 |
+
librosa==0.9.2
|
12 |
+
numpy==1.24.4
|
13 |
+
openai_whisper==20230124
|
14 |
+
protobuf==4.24.2
|
15 |
+
psola==0.0.1
|
16 |
+
pydub==0.25.1
|
17 |
+
silero==0.4.1
|
18 |
+
soundfile==0.11.0
|
19 |
+
stable_ts==1.4.0
|
20 |
+
# to be installed by user for desired PyTorch version
|
21 |
+
torch==2.1.2
|
22 |
+
torchaudio==2.1.2
|
23 |
+
# simuleval # to be installed by seamless_communication
|
24 |
+
Werkzeug==2.0.3
|
25 |
+
whisper==1.1.10
|
26 |
+
colorlog==6.7.0
|
27 |
+
python-socketio==5.9.0
|
28 |
+
uvicorn[standard]==0.23.2
|
29 |
+
parallel-wavegan==0.5.5
|
30 |
+
python-jose[cryptography]==3.3.0
|
31 |
+
starlette==0.32.0.post1
|
32 |
+
hf_transfer==0.1.4
|
33 |
+
huggingface_hub==0.19.4
|
34 |
+
python-socketio==5.9.0
|
35 |
+
contextlib2==21.6.0
|
36 |
+
pymongo==4.6.2
|
backend/routes/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from.routing import router
|
backend/routes/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (235 Bytes). View file
|
|
backend/routes/__pycache__/routing.cpython-310.pyc
ADDED
Binary file (375 Bytes). View file
|
|
backend/routes/routing.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
import sys
|
3 |
+
# sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
4 |
+
from mongodb.endpoints import users, calls
|
5 |
+
|
6 |
+
router = APIRouter()
|
7 |
+
router.include_router(calls.router)
|
8 |
+
router.include_router(users.router)
|
9 |
+
# router.include_router(transcripts.router)
|
backend/seamless/__init__.py
ADDED
File without changes
|
backend/seamless/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (163 Bytes). View file
|
|
backend/seamless/__pycache__/room.cpython-310.pyc
ADDED
Binary file (2.77 kB). View file
|
|
backend/seamless/__pycache__/simuleval_agent_directory.cpython-310.pyc
ADDED
Binary file (5.33 kB). View file
|
|
backend/seamless/__pycache__/simuleval_transcoder.cpython-310.pyc
ADDED
Binary file (14.1 kB). View file
|
|
backend/seamless/__pycache__/speech_and_text_output.cpython-310.pyc
ADDED
Binary file (668 Bytes). View file
|
|
backend/seamless/__pycache__/transcoder_helpers.cpython-310.pyc
ADDED
Binary file (872 Bytes). View file
|
|
backend/seamless/room.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import json
|
2 |
+
import uuid
|
3 |
+
|
4 |
+
|
5 |
+
class Room:
|
6 |
+
def __init__(self, room_id) -> None:
|
7 |
+
self.room_id = room_id
|
8 |
+
# members is a dict from client_id to Member
|
9 |
+
self.members = {}
|
10 |
+
|
11 |
+
# listeners and speakers are lists of client_id's
|
12 |
+
self.listeners = []
|
13 |
+
self.speakers = []
|
14 |
+
|
15 |
+
def __str__(self) -> str:
|
16 |
+
return f"Room {self.room_id} ({len(self.members)} member{'s' if len(self.members) == 1 else ''})"
|
17 |
+
|
18 |
+
def to_json(self):
|
19 |
+
varsResult = vars(self)
|
20 |
+
# Remember: result is just a shallow copy, so result.members === self.members
|
21 |
+
# Because of that, we need to jsonify self.members without writing over result.members,
|
22 |
+
# which we do here via dictionary unpacking (the ** operator)
|
23 |
+
result = {
|
24 |
+
**varsResult,
|
25 |
+
"members": {key: value.to_json() for (key, value) in self.members.items()},
|
26 |
+
"activeTranscoders": self.get_active_transcoders(),
|
27 |
+
}
|
28 |
+
|
29 |
+
return result
|
30 |
+
|
31 |
+
def get_active_connections(self):
|
32 |
+
return len(
|
33 |
+
[m for m in self.members.values() if m.connection_status == "connected"]
|
34 |
+
)
|
35 |
+
|
36 |
+
def get_active_transcoders(self):
|
37 |
+
return len([m for m in self.members.values() if m.transcoder is not None])
|
38 |
+
|
39 |
+
def get_room_status_dict(self):
|
40 |
+
return {
|
41 |
+
"activeConnections": self.get_active_connections(),
|
42 |
+
"activeTranscoders": self.get_active_transcoders(),
|
43 |
+
}
|
44 |
+
|
45 |
+
|
46 |
+
class Member:
|
47 |
+
def __init__(self, client_id, session_id, name) -> None:
|
48 |
+
self.client_id = client_id
|
49 |
+
self.session_id = session_id
|
50 |
+
self.name = name
|
51 |
+
self.connection_status = "connected"
|
52 |
+
self.transcoder = None
|
53 |
+
self.requested_output_type = None
|
54 |
+
self.transcoder_dynamic_config = None
|
55 |
+
|
56 |
+
def __str__(self) -> str:
|
57 |
+
return f"{self.name} (id: {self.client_id[:4]}...) ({self.connection_status})"
|
58 |
+
|
59 |
+
def to_json(self):
|
60 |
+
self_vars = vars(self)
|
61 |
+
return {
|
62 |
+
**self_vars,
|
63 |
+
"transcoder": self.transcoder is not None,
|
64 |
+
}
|
backend/seamless/simuleval_agent_directory.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Creates a directory in which to look up available agents
|
2 |
+
|
3 |
+
import os
|
4 |
+
from typing import List, Optional
|
5 |
+
from seamless.simuleval_transcoder import SimulevalTranscoder
|
6 |
+
import json
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logger = logging.getLogger("socketio_server_pubsub")
|
10 |
+
|
11 |
+
# fmt: off
|
12 |
+
M4T_P0_LANGS = [
|
13 |
+
"eng",
|
14 |
+
"arb", "ben", "cat", "ces", "cmn", "cym", "dan",
|
15 |
+
"deu", "est", "fin", "fra", "hin", "ind", "ita",
|
16 |
+
"jpn", "kor", "mlt", "nld", "pes", "pol", "por",
|
17 |
+
"ron", "rus", "slk", "spa", "swe", "swh", "tel",
|
18 |
+
"tgl", "tha", "tur", "ukr", "urd", "uzn", "vie",
|
19 |
+
]
|
20 |
+
# fmt: on
|
21 |
+
|
22 |
+
|
23 |
+
class NoAvailableAgentException(Exception):
|
24 |
+
pass
|
25 |
+
|
26 |
+
|
27 |
+
class AgentWithInfo:
|
28 |
+
def __init__(
|
29 |
+
self,
|
30 |
+
agent,
|
31 |
+
name: str,
|
32 |
+
modalities: List[str],
|
33 |
+
target_langs: List[str],
|
34 |
+
# Supported dynamic params are defined in StreamingTypes.ts
|
35 |
+
dynamic_params: List[str] = [],
|
36 |
+
description="",
|
37 |
+
has_expressive: Optional[bool] = None,
|
38 |
+
):
|
39 |
+
self.agent = agent
|
40 |
+
self.has_expressive = has_expressive
|
41 |
+
self.name = name
|
42 |
+
self.description = description
|
43 |
+
self.modalities = modalities
|
44 |
+
self.target_langs = target_langs
|
45 |
+
self.dynamic_params = dynamic_params
|
46 |
+
|
47 |
+
def get_capabilities_for_json(self):
|
48 |
+
return {
|
49 |
+
"name": self.name,
|
50 |
+
"description": self.description,
|
51 |
+
"modalities": self.modalities,
|
52 |
+
"targetLangs": self.target_langs,
|
53 |
+
"dynamicParams": self.dynamic_params,
|
54 |
+
}
|
55 |
+
|
56 |
+
@classmethod
|
57 |
+
def load_from_json(cls, config: str):
|
58 |
+
"""
|
59 |
+
Takes in JSON array of models to load in, e.g.
|
60 |
+
[{"name": "s2s_m4t_emma-unity2_multidomain_v0.1", "description": "M4T model that supports simultaneous S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]},
|
61 |
+
{"name": "s2s_m4t_expr-emma_v0.1", "description": "ES-EN expressive model that supports S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]}]
|
62 |
+
"""
|
63 |
+
configs = json.loads(config)
|
64 |
+
agents = []
|
65 |
+
for config in configs:
|
66 |
+
agent = SimulevalTranscoder.build_agent(config["name"])
|
67 |
+
agents.append(
|
68 |
+
AgentWithInfo(
|
69 |
+
agent=agent,
|
70 |
+
name=config["name"],
|
71 |
+
modalities=config["modalities"],
|
72 |
+
target_langs=config["targetLangs"],
|
73 |
+
)
|
74 |
+
)
|
75 |
+
return agents
|
76 |
+
|
77 |
+
|
78 |
+
class SimulevalAgentDirectory:
|
79 |
+
# Available models. These are the directories where the models can be found, and also serve as an ID for the model.
|
80 |
+
seamless_streaming_agent = "SeamlessStreaming"
|
81 |
+
seamless_agent = "Seamless"
|
82 |
+
|
83 |
+
def __init__(self):
|
84 |
+
self.agents = []
|
85 |
+
self.did_build_and_add_agents = False
|
86 |
+
|
87 |
+
def add_agent(self, agent: AgentWithInfo):
|
88 |
+
self.agents.append(agent)
|
89 |
+
|
90 |
+
def build_agent_if_available(self, model_id, config_name=None):
|
91 |
+
agent = None
|
92 |
+
try:
|
93 |
+
if config_name is not None:
|
94 |
+
agent = SimulevalTranscoder.build_agent(
|
95 |
+
model_id,
|
96 |
+
config_name=config_name,
|
97 |
+
)
|
98 |
+
else:
|
99 |
+
agent = SimulevalTranscoder.build_agent(
|
100 |
+
model_id,
|
101 |
+
)
|
102 |
+
except Exception as e:
|
103 |
+
from fairseq2.assets.error import AssetError
|
104 |
+
logger.warning("Failed to build agent %s: %s" % (model_id, e))
|
105 |
+
if isinstance(e, AssetError):
|
106 |
+
logger.warning(
|
107 |
+
"Please download gated assets and set `gated_model_dir` in the config"
|
108 |
+
)
|
109 |
+
raise e
|
110 |
+
|
111 |
+
return agent
|
112 |
+
|
113 |
+
def build_and_add_agents(self, models_override=None):
|
114 |
+
if self.did_build_and_add_agents:
|
115 |
+
return
|
116 |
+
|
117 |
+
if models_override is not None:
|
118 |
+
agent_infos = AgentWithInfo.load_from_json(models_override)
|
119 |
+
for agent_info in agent_infos:
|
120 |
+
self.add_agent(agent_info)
|
121 |
+
else:
|
122 |
+
s2s_agent = None
|
123 |
+
if os.environ.get("USE_EXPRESSIVE_MODEL", "0") == "1":
|
124 |
+
logger.info("Building expressive model...")
|
125 |
+
s2s_agent = self.build_agent_if_available(
|
126 |
+
SimulevalAgentDirectory.seamless_agent,
|
127 |
+
config_name="vad_s2st_sc_24khz_main.yaml",
|
128 |
+
)
|
129 |
+
has_expressive = True
|
130 |
+
else:
|
131 |
+
logger.info("Building non-expressive model...")
|
132 |
+
s2s_agent = self.build_agent_if_available(
|
133 |
+
SimulevalAgentDirectory.seamless_streaming_agent,
|
134 |
+
config_name="vad_s2st_sc_main.yaml",
|
135 |
+
)
|
136 |
+
has_expressive = False
|
137 |
+
|
138 |
+
if s2s_agent:
|
139 |
+
self.add_agent(
|
140 |
+
AgentWithInfo(
|
141 |
+
agent=s2s_agent,
|
142 |
+
name=SimulevalAgentDirectory.seamless_streaming_agent,
|
143 |
+
modalities=["s2t", "s2s"],
|
144 |
+
target_langs=M4T_P0_LANGS,
|
145 |
+
dynamic_params=["expressive"],
|
146 |
+
description="multilingual expressive model that supports S2S and S2T",
|
147 |
+
has_expressive=has_expressive,
|
148 |
+
)
|
149 |
+
)
|
150 |
+
|
151 |
+
if len(self.agents) == 0:
|
152 |
+
logger.error(
|
153 |
+
"No agents were loaded. This likely means you are missing the actual model files specified in simuleval_agent_directory."
|
154 |
+
)
|
155 |
+
|
156 |
+
self.did_build_and_add_agents = True
|
157 |
+
|
158 |
+
def get_agent(self, name):
|
159 |
+
for agent in self.agents:
|
160 |
+
if agent.name == name:
|
161 |
+
return agent
|
162 |
+
return None
|
163 |
+
|
164 |
+
def get_agent_or_throw(self, name):
|
165 |
+
agent = self.get_agent(name)
|
166 |
+
if agent is None:
|
167 |
+
raise NoAvailableAgentException("No agent found with name= %s" % (name))
|
168 |
+
return agent
|
169 |
+
|
170 |
+
def get_agents_capabilities_list_for_json(self):
|
171 |
+
return [agent.get_capabilities_for_json() for agent in self.agents]
|
backend/seamless/simuleval_transcoder.py
ADDED
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from simuleval.utils.agent import build_system_from_dir
|
2 |
+
from typing import Any, List, Optional, Tuple, Union
|
3 |
+
import numpy as np
|
4 |
+
import soundfile
|
5 |
+
import io
|
6 |
+
import asyncio
|
7 |
+
from simuleval.agents.pipeline import TreeAgentPipeline
|
8 |
+
from simuleval.agents.states import AgentStates
|
9 |
+
from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
|
10 |
+
import threading
|
11 |
+
import math
|
12 |
+
import logging
|
13 |
+
import sys
|
14 |
+
from pathlib import Path
|
15 |
+
import time
|
16 |
+
from g2p_en import G2p
|
17 |
+
import torch
|
18 |
+
import traceback
|
19 |
+
import time
|
20 |
+
import random
|
21 |
+
import colorlog
|
22 |
+
|
23 |
+
from .speech_and_text_output import SpeechAndTextOutput
|
24 |
+
|
25 |
+
MODEL_SAMPLE_RATE = 16_000
|
26 |
+
|
27 |
+
logger = logging.getLogger(__name__)
|
28 |
+
# logger.propagate = False
|
29 |
+
handler = colorlog.StreamHandler(stream=sys.stdout)
|
30 |
+
formatter = colorlog.ColoredFormatter(
|
31 |
+
"%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
|
32 |
+
reset=True,
|
33 |
+
log_colors={
|
34 |
+
"DEBUG": "cyan",
|
35 |
+
"INFO": "green",
|
36 |
+
"WARNING": "yellow",
|
37 |
+
"ERROR": "red",
|
38 |
+
"CRITICAL": "red,bg_white",
|
39 |
+
},
|
40 |
+
)
|
41 |
+
handler.setFormatter(formatter)
|
42 |
+
logger.addHandler(handler)
|
43 |
+
logger.setLevel(logging.WARNING)
|
44 |
+
|
45 |
+
|
46 |
+
class OutputSegments:
|
47 |
+
def __init__(self, segments: Union[List[Segment], Segment]):
|
48 |
+
if isinstance(segments, Segment):
|
49 |
+
segments = [segments]
|
50 |
+
self.segments: List[Segment] = [s for s in segments]
|
51 |
+
|
52 |
+
@property
|
53 |
+
def is_empty(self):
|
54 |
+
return all(segment.is_empty for segment in self.segments)
|
55 |
+
|
56 |
+
@property
|
57 |
+
def finished(self):
|
58 |
+
return all(segment.finished for segment in self.segments)
|
59 |
+
|
60 |
+
def compute_length(self, g2p):
|
61 |
+
lengths = []
|
62 |
+
for segment in self.segments:
|
63 |
+
if segment.data_type == "text":
|
64 |
+
lengths.append(len([x for x in g2p(segment.content) if x != " "]))
|
65 |
+
elif segment.data_type == "speech":
|
66 |
+
lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
|
67 |
+
elif isinstance(segment, EmptySegment):
|
68 |
+
continue
|
69 |
+
else:
|
70 |
+
logger.warning(
|
71 |
+
f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
|
72 |
+
)
|
73 |
+
return max(lengths)
|
74 |
+
|
75 |
+
@classmethod
|
76 |
+
def join_output_buffer(
|
77 |
+
cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
|
78 |
+
):
|
79 |
+
num_segments = len(buffer[0])
|
80 |
+
for i in range(num_segments):
|
81 |
+
segment_list = [
|
82 |
+
buffer[j][i]
|
83 |
+
for j in range(len(buffer))
|
84 |
+
if buffer[j][i].data_type is not None
|
85 |
+
]
|
86 |
+
if len(segment_list) == 0:
|
87 |
+
continue
|
88 |
+
if len(set(segment.data_type for segment in segment_list)) != 1:
|
89 |
+
logger.warning(
|
90 |
+
f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
|
91 |
+
)
|
92 |
+
continue
|
93 |
+
data_type = segment_list[0].data_type
|
94 |
+
if data_type == "text":
|
95 |
+
if output.text is not None:
|
96 |
+
logger.warning("Multiple text outputs, overwriting!")
|
97 |
+
output.text = " ".join([segment.content for segment in segment_list])
|
98 |
+
elif data_type == "speech":
|
99 |
+
if output.speech_samples is not None:
|
100 |
+
logger.warning("Multiple speech outputs, overwriting!")
|
101 |
+
speech_out = []
|
102 |
+
for segment in segment_list:
|
103 |
+
speech_out += segment.content
|
104 |
+
output.speech_samples = speech_out
|
105 |
+
output.speech_sample_rate = segment.sample_rate
|
106 |
+
elif isinstance(segment_list[0], EmptySegment):
|
107 |
+
continue
|
108 |
+
else:
|
109 |
+
logger.warning(
|
110 |
+
f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text"
|
111 |
+
)
|
112 |
+
|
113 |
+
return output
|
114 |
+
|
115 |
+
def __repr__(self) -> str:
|
116 |
+
repr_str = str(self.segments)
|
117 |
+
return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
|
118 |
+
|
119 |
+
|
120 |
+
class SimulevalTranscoder:
|
121 |
+
def __init__(self, agent, sample_rate, debug, buffer_limit):
|
122 |
+
self.agent = agent.agent
|
123 |
+
self.has_expressive = agent.has_expressive
|
124 |
+
self.input_queue = asyncio.Queue()
|
125 |
+
self.output_queue = asyncio.Queue()
|
126 |
+
self.states = self.agent.build_states()
|
127 |
+
if debug:
|
128 |
+
self.get_states_root().debug = True
|
129 |
+
self.incoming_sample_rate = sample_rate
|
130 |
+
self.close = False
|
131 |
+
self.g2p = G2p()
|
132 |
+
|
133 |
+
# buffer all outgoing translations within this amount of time
|
134 |
+
self.output_buffer_idle_ms = 5000
|
135 |
+
self.output_buffer_size_limit = (
|
136 |
+
buffer_limit # phonemes for text, seconds for speech
|
137 |
+
)
|
138 |
+
self.output_buffer_cur_size = 0
|
139 |
+
self.output_buffer: List[List[Segment]] = []
|
140 |
+
self.speech_output_sample_rate = None
|
141 |
+
|
142 |
+
self.last_output_ts = time.time() * 1000
|
143 |
+
self.timeout_ms = (
|
144 |
+
30000 # close the transcoder thread after this amount of silence
|
145 |
+
)
|
146 |
+
self.first_input_ts = None
|
147 |
+
self.first_output_ts = None
|
148 |
+
self.debug = debug
|
149 |
+
self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
|
150 |
+
if self.debug:
|
151 |
+
debug_folder = Path(__file__).resolve().parent.parent / "debug"
|
152 |
+
self.test_incoming_wav = soundfile.SoundFile(
|
153 |
+
debug_folder / f"{self.debug_ts}_test_incoming.wav",
|
154 |
+
mode="w+",
|
155 |
+
format="WAV",
|
156 |
+
subtype="PCM_16",
|
157 |
+
samplerate=self.incoming_sample_rate,
|
158 |
+
channels=1,
|
159 |
+
)
|
160 |
+
self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
|
161 |
+
debug_folder / f"{self.debug_ts}_test_input_segments.wav",
|
162 |
+
mode="w+",
|
163 |
+
format="WAV",
|
164 |
+
samplerate=MODEL_SAMPLE_RATE,
|
165 |
+
channels=1,
|
166 |
+
)
|
167 |
+
|
168 |
+
def get_states_root(self) -> AgentStates:
|
169 |
+
if isinstance(self.agent, TreeAgentPipeline):
|
170 |
+
# self.states is a dict
|
171 |
+
return self.states[self.agent.source_module]
|
172 |
+
else:
|
173 |
+
# self.states is a list
|
174 |
+
return self.states[0]
|
175 |
+
|
176 |
+
def reset_states(self):
|
177 |
+
if isinstance(self.agent, TreeAgentPipeline):
|
178 |
+
states_iter = self.states.values()
|
179 |
+
else:
|
180 |
+
states_iter = self.states
|
181 |
+
for state in states_iter:
|
182 |
+
state.reset()
|
183 |
+
|
184 |
+
def debug_log(self, *args):
|
185 |
+
if self.debug:
|
186 |
+
logger.info(*args)
|
187 |
+
|
188 |
+
@classmethod
|
189 |
+
def build_agent(cls, model_path, config_name):
|
190 |
+
logger.info(f"Building simuleval agent: {model_path}, {config_name}")
|
191 |
+
agent = build_system_from_dir(
|
192 |
+
Path(__file__).resolve().parent.parent / f"models/{model_path}",
|
193 |
+
config_name=config_name,
|
194 |
+
)
|
195 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
196 |
+
agent.to(device, fp16=True)
|
197 |
+
logger.info(
|
198 |
+
f"Successfully built simuleval agent {model_path} on device {device}"
|
199 |
+
)
|
200 |
+
|
201 |
+
return agent
|
202 |
+
|
203 |
+
def process_incoming_bytes(self, incoming_bytes, dynamic_config):
|
204 |
+
# TODO: We probably want to do some validation on dynamic_config to ensure it has what we needs
|
205 |
+
segment, sr = self._preprocess_wav(incoming_bytes)
|
206 |
+
segment = SpeechSegment(
|
207 |
+
content=segment,
|
208 |
+
sample_rate=sr,
|
209 |
+
tgt_lang=dynamic_config.get("targetLanguage"),
|
210 |
+
config=dynamic_config,
|
211 |
+
)
|
212 |
+
if dynamic_config.get("expressive") is True and self.has_expressive is False:
|
213 |
+
logger.warning(
|
214 |
+
"Passing 'expressive' but the agent does not support expressive output!"
|
215 |
+
)
|
216 |
+
# # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
|
217 |
+
self.input_queue.put_nowait(segment)
|
218 |
+
|
219 |
+
def get_input_segment(self):
|
220 |
+
if self.input_queue.empty():
|
221 |
+
return None
|
222 |
+
chunk = self.input_queue.get_nowait()
|
223 |
+
self.input_queue.task_done()
|
224 |
+
return chunk
|
225 |
+
|
226 |
+
def convert_waveform(
|
227 |
+
self,
|
228 |
+
waveform: Union[np.ndarray, torch.Tensor],
|
229 |
+
sample_rate: int,
|
230 |
+
normalize_volume: bool = False,
|
231 |
+
to_mono: bool = False,
|
232 |
+
to_sample_rate: Optional[int] = None,
|
233 |
+
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
|
234 |
+
"""convert a waveform:
|
235 |
+
- to a target sample rate
|
236 |
+
- from multi-channel to mono channel
|
237 |
+
- volume normalization
|
238 |
+
|
239 |
+
Args:
|
240 |
+
waveform (numpy.ndarray or torch.Tensor): 2D original waveform
|
241 |
+
(channels x length)
|
242 |
+
sample_rate (int): original sample rate
|
243 |
+
normalize_volume (bool): perform volume normalization
|
244 |
+
to_mono (bool): convert to mono channel if having multiple channels
|
245 |
+
to_sample_rate (Optional[int]): target sample rate
|
246 |
+
Returns:
|
247 |
+
waveform (numpy.ndarray): converted 2D waveform (channels x length)
|
248 |
+
sample_rate (float): target sample rate
|
249 |
+
"""
|
250 |
+
try:
|
251 |
+
import torchaudio.sox_effects as ta_sox
|
252 |
+
except ImportError:
|
253 |
+
raise ImportError("Please install torchaudio: pip install torchaudio")
|
254 |
+
|
255 |
+
effects = []
|
256 |
+
if normalize_volume:
|
257 |
+
effects.append(["gain", "-n"])
|
258 |
+
if to_sample_rate is not None and to_sample_rate != sample_rate:
|
259 |
+
effects.append(["rate", f"{to_sample_rate}"])
|
260 |
+
if to_mono and waveform.shape[0] > 1:
|
261 |
+
effects.append(["channels", "1"])
|
262 |
+
if len(effects) > 0:
|
263 |
+
is_np_input = isinstance(waveform, np.ndarray)
|
264 |
+
_waveform = torch.from_numpy(waveform) if is_np_input else waveform
|
265 |
+
converted, converted_sample_rate = ta_sox.apply_effects_tensor(
|
266 |
+
_waveform, sample_rate, effects
|
267 |
+
)
|
268 |
+
if is_np_input:
|
269 |
+
converted = converted.numpy()
|
270 |
+
return converted, converted_sample_rate
|
271 |
+
return waveform, sample_rate
|
272 |
+
|
273 |
+
def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
|
274 |
+
segment, sample_rate = soundfile.read(
|
275 |
+
io.BytesIO(data),
|
276 |
+
dtype="float32",
|
277 |
+
always_2d=True,
|
278 |
+
frames=-1,
|
279 |
+
start=0,
|
280 |
+
format="RAW",
|
281 |
+
subtype="PCM_16",
|
282 |
+
samplerate=self.incoming_sample_rate,
|
283 |
+
channels=1,
|
284 |
+
)
|
285 |
+
if self.debug:
|
286 |
+
self.test_incoming_wav.seek(0, soundfile.SEEK_END)
|
287 |
+
self.test_incoming_wav.write(segment)
|
288 |
+
|
289 |
+
segment = segment.T
|
290 |
+
segment, new_sample_rate = self.convert_waveform(
|
291 |
+
segment,
|
292 |
+
sample_rate,
|
293 |
+
normalize_volume=False,
|
294 |
+
to_mono=True,
|
295 |
+
to_sample_rate=MODEL_SAMPLE_RATE,
|
296 |
+
)
|
297 |
+
|
298 |
+
assert MODEL_SAMPLE_RATE == new_sample_rate
|
299 |
+
segment = segment.squeeze(axis=0)
|
300 |
+
return segment, new_sample_rate
|
301 |
+
|
302 |
+
def process_pipeline_impl(self, input_segment):
|
303 |
+
try:
|
304 |
+
with torch.no_grad():
|
305 |
+
output_segment = OutputSegments(
|
306 |
+
self.agent.pushpop(input_segment, self.states)
|
307 |
+
)
|
308 |
+
if (
|
309 |
+
self.get_states_root().first_input_ts is not None
|
310 |
+
and self.first_input_ts is None
|
311 |
+
):
|
312 |
+
# TODO: this is hacky
|
313 |
+
self.first_input_ts = self.get_states_root().first_input_ts
|
314 |
+
|
315 |
+
if not output_segment.is_empty:
|
316 |
+
self.output_queue.put_nowait(output_segment)
|
317 |
+
|
318 |
+
if output_segment.finished:
|
319 |
+
self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
|
320 |
+
|
321 |
+
self.reset_states()
|
322 |
+
|
323 |
+
if self.debug:
|
324 |
+
# when we rebuild states, this value is reset to whatever
|
325 |
+
# is in the system dir config, which defaults debug=False.
|
326 |
+
self.get_states_root().debug = True
|
327 |
+
except Exception as e:
|
328 |
+
logger.error(f"Got exception while processing pipeline: {e}")
|
329 |
+
traceback.print_exc()
|
330 |
+
return input_segment
|
331 |
+
|
332 |
+
def process_pipeline_loop(self):
|
333 |
+
if self.close:
|
334 |
+
return # closes the thread
|
335 |
+
|
336 |
+
self.debug_log("processing_pipeline")
|
337 |
+
while not self.close:
|
338 |
+
input_segment = self.get_input_segment()
|
339 |
+
if input_segment is None:
|
340 |
+
if self.get_states_root().is_fresh_state: # TODO: this is hacky
|
341 |
+
time.sleep(0.3)
|
342 |
+
else:
|
343 |
+
time.sleep(0.03)
|
344 |
+
continue
|
345 |
+
self.process_pipeline_impl(input_segment)
|
346 |
+
self.debug_log("finished processing_pipeline")
|
347 |
+
|
348 |
+
def process_pipeline_once(self):
|
349 |
+
if self.close:
|
350 |
+
return
|
351 |
+
|
352 |
+
self.debug_log("processing pipeline once")
|
353 |
+
input_segment = self.get_input_segment()
|
354 |
+
if input_segment is None:
|
355 |
+
return
|
356 |
+
self.process_pipeline_impl(input_segment)
|
357 |
+
self.debug_log("finished processing_pipeline_once")
|
358 |
+
|
359 |
+
def get_output_segment(self):
|
360 |
+
if self.output_queue.empty():
|
361 |
+
return None
|
362 |
+
|
363 |
+
output_chunk = self.output_queue.get_nowait()
|
364 |
+
self.output_queue.task_done()
|
365 |
+
return output_chunk
|
366 |
+
|
367 |
+
def start(self):
|
368 |
+
self.debug_log("starting transcoder in a thread")
|
369 |
+
threading.Thread(target=self.process_pipeline_loop).start()
|
370 |
+
|
371 |
+
def first_translation_time(self):
|
372 |
+
return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
|
373 |
+
|
374 |
+
def get_buffered_output(self) -> SpeechAndTextOutput:
|
375 |
+
now = time.time() * 1000
|
376 |
+
self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
|
377 |
+
while not self.output_queue.empty():
|
378 |
+
tmp_out = self.get_output_segment()
|
379 |
+
if tmp_out and tmp_out.compute_length(self.g2p) > 0:
|
380 |
+
if len(self.output_buffer) == 0:
|
381 |
+
self.last_output_ts = now
|
382 |
+
self._populate_output_buffer(tmp_out)
|
383 |
+
self._increment_output_buffer_size(tmp_out)
|
384 |
+
|
385 |
+
if tmp_out.finished:
|
386 |
+
self.debug_log("tmp_out.finished")
|
387 |
+
res = self._gather_output_buffer_data(final=True)
|
388 |
+
self.debug_log(f"gathered output data: {res}")
|
389 |
+
self.output_buffer = []
|
390 |
+
self.increment_output_buffer_size = 0
|
391 |
+
self.last_output_ts = now
|
392 |
+
self.first_output_ts = now
|
393 |
+
return res
|
394 |
+
else:
|
395 |
+
self.debug_log("tmp_out.compute_length is not > 0")
|
396 |
+
|
397 |
+
if len(self.output_buffer) > 0 and (
|
398 |
+
now - self.last_output_ts >= self.output_buffer_idle_ms
|
399 |
+
or self.output_buffer_cur_size >= self.output_buffer_size_limit
|
400 |
+
):
|
401 |
+
self.debug_log(
|
402 |
+
"[get_buffered_output] output_buffer is not empty. getting res to return."
|
403 |
+
)
|
404 |
+
self.last_output_ts = now
|
405 |
+
res = self._gather_output_buffer_data(final=False)
|
406 |
+
self.debug_log(f"gathered output data: {res}")
|
407 |
+
self.output_buffer = []
|
408 |
+
self.output_buffer_phoneme_count = 0
|
409 |
+
self.first_output_ts = now
|
410 |
+
return res
|
411 |
+
else:
|
412 |
+
self.debug_log("[get_buffered_output] output_buffer is empty...")
|
413 |
+
return None
|
414 |
+
|
415 |
+
def _gather_output_buffer_data(self, final):
|
416 |
+
output = SpeechAndTextOutput()
|
417 |
+
output.final = final
|
418 |
+
output = OutputSegments.join_output_buffer(self.output_buffer, output)
|
419 |
+
return output
|
420 |
+
|
421 |
+
def _increment_output_buffer_size(self, segment: OutputSegments):
|
422 |
+
self.output_buffer_cur_size += segment.compute_length(self.g2p)
|
423 |
+
|
424 |
+
def _populate_output_buffer(self, segment: OutputSegments):
|
425 |
+
self.output_buffer.append(segment.segments)
|
426 |
+
|
427 |
+
def _compute_phoneme_count(self, string: str) -> int:
|
428 |
+
return len([x for x in self.g2p(string) if x != " "])
|
backend/seamless/speech_and_text_output.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Provides a container to return both speech and text output from our model at the same time
|
2 |
+
|
3 |
+
|
4 |
+
class SpeechAndTextOutput:
|
5 |
+
def __init__(
|
6 |
+
self,
|
7 |
+
text: str = None,
|
8 |
+
speech_samples: list = None,
|
9 |
+
speech_sample_rate: float = None,
|
10 |
+
final: bool = False,
|
11 |
+
):
|
12 |
+
self.text = text
|
13 |
+
self.speech_samples = speech_samples
|
14 |
+
self.speech_sample_rate = speech_sample_rate
|
15 |
+
self.final = final
|
backend/seamless/transcoder_helpers.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
+
logger = logging.getLogger("socketio_server_pubsub")
|
4 |
+
|
5 |
+
|
6 |
+
def get_transcoder_output_events(transcoder) -> list:
|
7 |
+
speech_and_text_output = transcoder.get_buffered_output()
|
8 |
+
if speech_and_text_output is None:
|
9 |
+
logger.debug("No output from transcoder.get_buffered_output()")
|
10 |
+
return []
|
11 |
+
|
12 |
+
logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
|
13 |
+
|
14 |
+
lat = None
|
15 |
+
|
16 |
+
events = []
|
17 |
+
|
18 |
+
if speech_and_text_output.speech_samples:
|
19 |
+
events.append(
|
20 |
+
{
|
21 |
+
"event": "translation_speech",
|
22 |
+
"payload": speech_and_text_output.speech_samples,
|
23 |
+
"sample_rate": speech_and_text_output.speech_sample_rate,
|
24 |
+
}
|
25 |
+
)
|
26 |
+
|
27 |
+
if speech_and_text_output.text:
|
28 |
+
events.append(
|
29 |
+
{
|
30 |
+
"event": "translation_text",
|
31 |
+
"payload": speech_and_text_output.text,
|
32 |
+
}
|
33 |
+
)
|
34 |
+
|
35 |
+
for e in events:
|
36 |
+
e["eos"] = speech_and_text_output.final
|
37 |
+
|
38 |
+
# if not latency_sent:
|
39 |
+
# lat = transcoder.first_translation_time()
|
40 |
+
# latency_sent = True
|
41 |
+
# to_send["latency"] = lat
|
42 |
+
|
43 |
+
return events
|
backend/seamless_utils.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# base seamless imports
|
3 |
+
# ---------------------------------
|
4 |
+
import io
|
5 |
+
import json
|
6 |
+
import matplotlib as mpl
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import mmap
|
9 |
+
import numpy as np
|
10 |
+
import soundfile
|
11 |
+
import torchaudio
|
12 |
+
import torch
|
13 |
+
from pydub import AudioSegment
|
14 |
+
# ---------------------------------
|
15 |
+
# seamless-streaming specific imports
|
16 |
+
# ---------------------------------
|
17 |
+
import math
|
18 |
+
from simuleval.data.segments import SpeechSegment, EmptySegment
|
19 |
+
from seamless_communication.streaming.agents.seamless_streaming_s2st import (
|
20 |
+
SeamlessStreamingS2STVADAgent,
|
21 |
+
)
|
22 |
+
|
23 |
+
from simuleval.utils.arguments import cli_argument_list
|
24 |
+
from simuleval import options
|
25 |
+
|
26 |
+
|
27 |
+
from typing import Union, List
|
28 |
+
from simuleval.data.segments import Segment, TextSegment
|
29 |
+
from simuleval.agents.pipeline import TreeAgentPipeline
|
30 |
+
from simuleval.agents.states import AgentStates
|
31 |
+
# ---------------------------------
|
32 |
+
# seamless setup
|
33 |
+
# source: https://colab.research.google.com/github/kauterry/seamless_communication/blob/main/Seamless_Tutorial.ipynb?
|
34 |
+
SAMPLE_RATE = 16000
|
35 |
+
|
36 |
+
# PM - THis class is used to simulate the audio frontend in the seamless streaming pipeline
|
37 |
+
# need to replace this with the actual audio frontend
|
38 |
+
# TODO: replacement class that takes in PCM-16 bytes and returns SpeechSegment
|
39 |
+
class AudioFrontEnd:
|
40 |
+
def __init__(self, wav_file, segment_size) -> None:
|
41 |
+
self.samples, self.sample_rate = soundfile.read(wav_file)
|
42 |
+
print(self.sample_rate, "sample rate")
|
43 |
+
assert self.sample_rate == SAMPLE_RATE
|
44 |
+
# print(len(self.samples), self.samples[:100])
|
45 |
+
self.samples = self.samples # .tolist()
|
46 |
+
self.segment_size = segment_size
|
47 |
+
self.step = 0
|
48 |
+
|
49 |
+
def send_segment(self):
|
50 |
+
"""
|
51 |
+
This is the front-end logic in simuleval instance.py
|
52 |
+
"""
|
53 |
+
|
54 |
+
num_samples = math.ceil(self.segment_size / 1000 * self.sample_rate)
|
55 |
+
|
56 |
+
if self.step < len(self.samples):
|
57 |
+
if self.step + num_samples >= len(self.samples):
|
58 |
+
samples = self.samples[self.step :]
|
59 |
+
is_finished = True
|
60 |
+
else:
|
61 |
+
samples = self.samples[self.step : self.step + num_samples]
|
62 |
+
is_finished = False
|
63 |
+
self.samples = self.samples[self.step:]
|
64 |
+
self.step = min(self.step + num_samples, len(self.samples))
|
65 |
+
segment = SpeechSegment(
|
66 |
+
content=samples,
|
67 |
+
sample_rate=self.sample_rate,
|
68 |
+
finished=is_finished,
|
69 |
+
)
|
70 |
+
else:
|
71 |
+
# Finish reading this audio
|
72 |
+
segment = EmptySegment(
|
73 |
+
finished=True,
|
74 |
+
)
|
75 |
+
self.step = 0
|
76 |
+
self.samples = []
|
77 |
+
return segment
|
78 |
+
|
79 |
+
# samples = self.samples[:num_samples]
|
80 |
+
# self.samples = self.samples[num_samples:]
|
81 |
+
# segment = SpeechSegment(
|
82 |
+
# content=samples,
|
83 |
+
# sample_rate=self.sample_rate,
|
84 |
+
# finished=False,
|
85 |
+
# )
|
86 |
+
|
87 |
+
|
88 |
+
def add_segments(self, wav):
|
89 |
+
new_samples, _ = soundfile.read(wav)
|
90 |
+
self.samples = np.concatenate((self.samples, new_samples))
|
91 |
+
|
92 |
+
|
93 |
+
class OutputSegments:
|
94 |
+
def __init__(self, segments: Union[List[Segment], Segment]):
|
95 |
+
if isinstance(segments, Segment):
|
96 |
+
segments = [segments]
|
97 |
+
self.segments: List[Segment] = [s for s in segments]
|
98 |
+
|
99 |
+
@property
|
100 |
+
def is_empty(self):
|
101 |
+
return all(segment.is_empty for segment in self.segments)
|
102 |
+
|
103 |
+
@property
|
104 |
+
def finished(self):
|
105 |
+
return all(segment.finished for segment in self.segments)
|
106 |
+
|
107 |
+
|
108 |
+
def get_audiosegment(samples, sr):
|
109 |
+
b = io.BytesIO()
|
110 |
+
soundfile.write(b, samples, samplerate=sr, format="wav")
|
111 |
+
b.seek(0)
|
112 |
+
return AudioSegment.from_file(b)
|
113 |
+
|
114 |
+
|
115 |
+
def reset_states(system, states):
|
116 |
+
if isinstance(system, TreeAgentPipeline):
|
117 |
+
states_iter = states.values()
|
118 |
+
else:
|
119 |
+
states_iter = states
|
120 |
+
for state in states_iter:
|
121 |
+
state.reset()
|
122 |
+
|
123 |
+
|
124 |
+
def get_states_root(system, states) -> AgentStates:
|
125 |
+
if isinstance(system, TreeAgentPipeline):
|
126 |
+
# self.states is a dict
|
127 |
+
return states[system.source_module]
|
128 |
+
else:
|
129 |
+
# self.states is a list
|
130 |
+
return system.states[0]
|
131 |
+
|
132 |
+
|
133 |
+
def build_streaming_system(model_configs, agent_class):
|
134 |
+
parser = options.general_parser()
|
135 |
+
parser.add_argument("-f", "--f", help="a dummy argument to fool ipython", default="1")
|
136 |
+
|
137 |
+
agent_class.add_args(parser)
|
138 |
+
args, _ = parser.parse_known_args(cli_argument_list(model_configs))
|
139 |
+
system = agent_class.from_args(args)
|
140 |
+
return system
|
141 |
+
|
142 |
+
|
143 |
+
def run_streaming_inference(system, audio_frontend, system_states, tgt_lang):
|
144 |
+
# NOTE: Here for visualization, we calculate delays offset from audio
|
145 |
+
# *BEFORE* VAD segmentation.
|
146 |
+
# In contrast for SimulEval evaluation, we assume audios are pre-segmented,
|
147 |
+
# and Average Lagging, End Offset metrics are based on those pre-segmented audios.
|
148 |
+
# Thus, delays here are *NOT* comparable to SimulEval per-segment delays
|
149 |
+
delays = {"s2st": [], "s2tt": []}
|
150 |
+
prediction_lists = {"s2st": [], "s2tt": []}
|
151 |
+
speech_durations = []
|
152 |
+
curr_delay = 0
|
153 |
+
target_sample_rate = None
|
154 |
+
|
155 |
+
while True:
|
156 |
+
input_segment = audio_frontend.send_segment()
|
157 |
+
input_segment.tgt_lang = tgt_lang
|
158 |
+
curr_delay += len(input_segment.content) / SAMPLE_RATE * 1000
|
159 |
+
if input_segment.finished:
|
160 |
+
# a hack, we expect a real stream to end with silence
|
161 |
+
get_states_root(system, system_states).source_finished = True
|
162 |
+
# Translation happens here
|
163 |
+
if isinstance(input_segment, EmptySegment):
|
164 |
+
return None, None, None, None
|
165 |
+
output_segments = OutputSegments(system.pushpop(input_segment, system_states))
|
166 |
+
if not output_segments.is_empty:
|
167 |
+
for segment in output_segments.segments:
|
168 |
+
# NOTE: another difference from SimulEval evaluation -
|
169 |
+
# delays are accumulated per-token
|
170 |
+
if isinstance(segment, SpeechSegment):
|
171 |
+
pred_duration = 1000 * len(segment.content) / segment.sample_rate
|
172 |
+
speech_durations.append(pred_duration)
|
173 |
+
delays["s2st"].append(curr_delay)
|
174 |
+
prediction_lists["s2st"].append(segment.content)
|
175 |
+
target_sample_rate = segment.sample_rate
|
176 |
+
elif isinstance(segment, TextSegment):
|
177 |
+
delays["s2tt"].append(curr_delay)
|
178 |
+
prediction_lists["s2tt"].append(segment.content)
|
179 |
+
print(curr_delay, segment.content)
|
180 |
+
if output_segments.finished:
|
181 |
+
reset_states(system, system_states)
|
182 |
+
if input_segment.finished:
|
183 |
+
# an assumption of SimulEval agents -
|
184 |
+
# once source_finished=True, generate until output translation is finished
|
185 |
+
break
|
186 |
+
return delays, prediction_lists, speech_durations, target_sample_rate
|
187 |
+
|
188 |
+
|
189 |
+
def get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations):
|
190 |
+
# get calculate intervals + durations for s2st
|
191 |
+
intervals = []
|
192 |
+
|
193 |
+
start = prev_end = prediction_offset = delays["s2st"][0]
|
194 |
+
target_samples = [0.0] * int(target_sample_rate * prediction_offset / 1000)
|
195 |
+
|
196 |
+
for i, delay in enumerate(delays["s2st"]):
|
197 |
+
start = max(prev_end, delay)
|
198 |
+
|
199 |
+
if start > prev_end:
|
200 |
+
# Wait source speech, add discontinuity with silence
|
201 |
+
target_samples += [0.0] * int(
|
202 |
+
target_sample_rate * (start - prev_end) / 1000
|
203 |
+
)
|
204 |
+
|
205 |
+
target_samples += prediction_lists["s2st"][i]
|
206 |
+
duration = speech_durations[i]
|
207 |
+
prev_end = start + duration
|
208 |
+
intervals.append([start, duration])
|
209 |
+
return target_samples, intervals
|
210 |
+
|
backend/src/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (243 Bytes). View file
|
|
backend/src/testing/test.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
|
3 |
+
|
4 |
+
import utility
|
backend/src/utility/__pycache__/utility.cpython-310.pyc
ADDED
Binary file (202 Bytes). View file
|
|