benjolo commited on
Commit
d594983
1 Parent(s): 77be911

Adding backend/ folder to HF space

Browse files

Adding all source files in backend folder to space

Files changed (44) hide show
  1. backend/.DS_Store +0 -0
  2. backend/.env +1 -0
  3. backend/Client.py +74 -0
  4. backend/__pycache__/Client.cpython-310.pyc +0 -0
  5. backend/__pycache__/main.cpython-310.pyc +0 -0
  6. backend/backend.log +0 -0
  7. backend/main.py +338 -0
  8. backend/models/Seamless/vad_s2st_sc_24khz_main.yaml +25 -0
  9. backend/models/SeamlessStreaming/vad_s2st_sc_main.yaml +21 -0
  10. backend/mongodb/endpoints/__pycache__/calls.cpython-310.pyc +0 -0
  11. backend/mongodb/endpoints/__pycache__/users.cpython-310.pyc +0 -0
  12. backend/mongodb/endpoints/calls.py +53 -0
  13. backend/mongodb/endpoints/users.py +37 -0
  14. backend/mongodb/models/__pycache__/calls.cpython-310.pyc +0 -0
  15. backend/mongodb/models/__pycache__/users.cpython-310.pyc +0 -0
  16. backend/mongodb/models/calls.py +71 -0
  17. backend/mongodb/models/users.py +46 -0
  18. backend/mongodb/operations/__pycache__/calls.cpython-310.pyc +0 -0
  19. backend/mongodb/operations/__pycache__/users.cpython-310.pyc +0 -0
  20. backend/mongodb/operations/calls.py +128 -0
  21. backend/mongodb/operations/users.py +77 -0
  22. backend/pcmToWav.py +34 -0
  23. backend/preprocess_wav.py +65 -0
  24. backend/requirements.txt +36 -0
  25. backend/routes/__init__.py +1 -0
  26. backend/routes/__pycache__/__init__.cpython-310.pyc +0 -0
  27. backend/routes/__pycache__/routing.cpython-310.pyc +0 -0
  28. backend/routes/routing.py +9 -0
  29. backend/seamless/__init__.py +0 -0
  30. backend/seamless/__pycache__/__init__.cpython-310.pyc +0 -0
  31. backend/seamless/__pycache__/room.cpython-310.pyc +0 -0
  32. backend/seamless/__pycache__/simuleval_agent_directory.cpython-310.pyc +0 -0
  33. backend/seamless/__pycache__/simuleval_transcoder.cpython-310.pyc +0 -0
  34. backend/seamless/__pycache__/speech_and_text_output.cpython-310.pyc +0 -0
  35. backend/seamless/__pycache__/transcoder_helpers.cpython-310.pyc +0 -0
  36. backend/seamless/room.py +64 -0
  37. backend/seamless/simuleval_agent_directory.py +171 -0
  38. backend/seamless/simuleval_transcoder.py +428 -0
  39. backend/seamless/speech_and_text_output.py +15 -0
  40. backend/seamless/transcoder_helpers.py +43 -0
  41. backend/seamless_utils.py +210 -0
  42. backend/src/models/__pycache__/__init__.cpython-310.pyc +0 -0
  43. backend/src/testing/test.py +4 -0
  44. backend/src/utility/__pycache__/utility.cpython-310.pyc +0 -0
backend/.DS_Store ADDED
Binary file (6.15 kB). View file
 
backend/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ MONGODB_URI=mongodb+srv://benjolo:[email protected]/?retryWrites=true&w=majority&appName=IT-Cluster1
backend/Client.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+ import wave
3
+ import os
4
+
5
+ import torchaudio
6
+ from vad import EnergyVAD
7
+ TARGET_SAMPLING_RATE = 16000
8
+
9
+ def create_frames(data: bytes, frame_duration: int) -> Tuple[bytes]:
10
+ frame_size = int(TARGET_SAMPLING_RATE * (frame_duration / 1000))
11
+ return (data[i:i + frame_size] for i in range(0, len(data), frame_size)), frame_size
12
+
13
+ def detect_activity(energies: list):
14
+ if sum(energies) < len(energies) / 12:
15
+ return False
16
+ count = 0
17
+ for energy in energies:
18
+ if energy == 1:
19
+ count += 1
20
+ if count == 12:
21
+ return True
22
+ else:
23
+ count = 0
24
+ return False
25
+
26
+ class Client:
27
+ def __init__(self, sid, client_id, call_id=None):
28
+ self.sid = sid
29
+ self.client_id = client_id
30
+ self.call_id = call_id
31
+ self.buffer = bytearray()
32
+ self.output_path = self.sid + "_output_audio.wav"
33
+ self.target_language = None
34
+ self.original_sr = None
35
+ self.vad = EnergyVAD(
36
+ sample_rate=TARGET_SAMPLING_RATE,
37
+ frame_length=25,
38
+ frame_shift=20,
39
+ energy_threshold=0.05,
40
+ pre_emphasis=0.95,
41
+ ) # PM - Default values given in the docs for this class
42
+
43
+ def add_bytes(self, new_bytes):
44
+ self.buffer += new_bytes
45
+
46
+ def resample_and_write_to_file(self):
47
+ print("Audio being written to file....\n")
48
+ with wave.open(self.sid + "_OG.wav", "wb") as wf:
49
+ wf.setnchannels(1)
50
+ wf.setsampwidth(2)
51
+ wf.setframerate(self.original_sr)
52
+ wf.setnframes(0)
53
+ wf.setcomptype("NONE", "not compressed")
54
+ wf.writeframes(self.buffer)
55
+ waveform, sample_rate = torchaudio.load(self.sid + "_OG.wav")
56
+ resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLING_RATE, dtype=waveform.dtype)
57
+ resampled_waveform = resampler(waveform)
58
+ # torchaudio.save(self.output_path, resampled_waveform, TARGET_SAMPLING_RATE)
59
+ vad_waveform = self.vad(resampled_waveform)
60
+ # print(vad_waveform) # debugging
61
+ self.buffer = bytearray()
62
+ return detect_activity(vad_waveform), resampled_waveform
63
+
64
+ def get_length(self):
65
+ return len(self.buffer)
66
+
67
+ def __del__(self):
68
+ if len(self.buffer) > 0:
69
+ print(f"🚨 [ClientAudioBuffer] Buffer not empty for {self.sid} ({len(self.buffer)} bytes)!")
70
+ if os.path.exists(self.output_path):
71
+ os.remove(self.output_path)
72
+ if os.path.exists(self.sid + "_OG.wav"):
73
+ os.remove(self.sid + "_OG.wav")
74
+
backend/__pycache__/Client.cpython-310.pyc ADDED
Binary file (2.99 kB). View file
 
backend/__pycache__/main.cpython-310.pyc ADDED
Binary file (8.55 kB). View file
 
backend/backend.log ADDED
The diff for this file is too large to render. See raw diff
 
backend/main.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from operator import itemgetter
2
+ import os
3
+ from datetime import datetime
4
+ import uvicorn
5
+ from typing import Any, Optional, Tuple, Dict, TypedDict
6
+ from urllib import parse
7
+ from uuid import uuid4
8
+ import logging
9
+ ###############################################
10
+ # Configure logger
11
+ ###############################################
12
+ logging.basicConfig(filename="backend.log",
13
+ filemode='w',
14
+ format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
15
+ datefmt='%H:%M:%S',
16
+ level=logging.DEBUG)
17
+
18
+ logger = logging.getLogger("socketio_server_pubsub")
19
+ logger.propagate = True
20
+
21
+ import sys
22
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/')
23
+
24
+ from fastapi import FastAPI
25
+ from fastapi.middleware.cors import CORSMiddleware
26
+ from pymongo import MongoClient
27
+ from dotenv import dotenv_values
28
+ from routes import router as api_router
29
+ from contextlib import asynccontextmanager
30
+ import requests
31
+
32
+ from typing import List
33
+ from datetime import date
34
+ from mongodb.operations.calls import *
35
+ from mongodb.models.calls import UserCall, UpdateCall
36
+ # from mongodb.endpoints.calls import *
37
+
38
+ from transformers import AutoProcessor, SeamlessM4Tv2Model
39
+
40
+ # from seamless_communication.inference import Translator
41
+ from Client import Client
42
+ #----------------------------------
43
+ # base seamless imports
44
+ # ---------------------------------
45
+ import numpy as np
46
+ import torch
47
+ # ---------------------------------
48
+ import socketio
49
+
50
+ DEBUG = True
51
+
52
+ ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME = "remove_server_lock"
53
+
54
+ TARGET_SAMPLING_RATE = 16000
55
+ MAX_BYTES_BUFFER = 480_000
56
+
57
+ print("")
58
+ print("")
59
+ print("=" * 20 + " ⭐️ Starting Server... ⭐️ " + "=" * 20)
60
+
61
+ ###############################################
62
+ # Configure socketio server
63
+ ###############################################
64
+
65
+ # TODO PM - change this to the actual path
66
+ # seamless remnant code
67
+ CLIENT_BUILD_PATH = "../streaming-react-app/dist/"
68
+ static_files = {
69
+ "/": CLIENT_BUILD_PATH,
70
+ "/assets/seamless-db6a2555.svg": {
71
+ "filename": CLIENT_BUILD_PATH + "assets/seamless-db6a2555.svg",
72
+ "content_type": "image/svg+xml",
73
+ },
74
+ }
75
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
76
+ processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
77
+ # PM - hardcoding temporarily as my GPU doesnt have enough vram
78
+ model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to("cpu")
79
+
80
+ config = dotenv_values(".env")
81
+
82
+ # Read connection string from environment vars
83
+ uri = os.environ['MONGODB_URI']
84
+
85
+ # Read connection string from .env file
86
+ # uri = config['MONGODB_URI']
87
+
88
+ # MongoDB Connection Lifespan Events
89
+ @asynccontextmanager
90
+ async def lifespan(app: FastAPI):
91
+ # startup logic
92
+ app.mongodb_client = MongoClient(uri)
93
+ app.database = app.mongodb_client['IT-Cluster1'] #connect to interpretalk primary db
94
+ try:
95
+ app.mongodb_client.admin.command('ping')
96
+ print("MongoDB Connection Established...")
97
+ except Exception as e:
98
+ print(e)
99
+
100
+ yield
101
+
102
+ # shutdown logic
103
+ print("Closing MongoDB Connection...")
104
+ app.mongodb_client.close()
105
+
106
+ app = FastAPI(lifespan=lifespan, logger=logger)
107
+
108
+ # New CORS funcitonality
109
+ app.add_middleware(
110
+ CORSMiddleware,
111
+ allow_origins=["*"], # configured node app port
112
+ allow_credentials=True,
113
+ allow_methods=["*"],
114
+ allow_headers=["*"],
115
+ )
116
+
117
+ app.include_router(api_router) # include routers for user, calls and transcripts operations
118
+
119
+
120
+ # sio is the main socket.io entrypoint
121
+ sio = socketio.AsyncServer(
122
+ async_mode="asgi",
123
+ cors_allowed_origins="*",
124
+ logger=logger,
125
+ engineio_logger=logger,
126
+ )
127
+ # sio.logger.setLevel(logging.DEBUG)
128
+ socketio_app = socketio.ASGIApp(sio)
129
+ # app.mount("/", socketio_app)
130
+
131
+ from fastapi import APIRouter, Body, Request, status
132
+
133
+ bytes_data = bytearray()
134
+ model_name = "seamlessM4T_v2_large"
135
+ vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"
136
+
137
+ clients = {}
138
+ rooms = {}
139
+
140
+
141
+ def get_collection_users():
142
+ return app.database["user_records"]
143
+
144
+ def get_collection_calls():
145
+ # return app.database["call_records"]
146
+ return app.database["call_test"]
147
+
148
+
149
+ @app.get("/test/", response_description="List all existing call records", response_model=List[UserCall])
150
+ def test():
151
+
152
+ result = list_calls(get_collection_calls(), 100)
153
+
154
+ # return {"message": "Welcome to InterpreTalk!"}
155
+
156
+ print(result)
157
+ return result
158
+
159
+
160
+ @app.put("/test_put/", response_description="List all existing call records", response_model=UserCall)
161
+ def test_put():
162
+
163
+ # result = list_calls(get_collection_calls(), 100)
164
+ # result = send_captions("TEST", "TEST", "TEST", "oUjUxTYTQFVVjEarIcZ0")
165
+ result = send_captions("TEST", "TEST", "TEST", "TESTID000001")
166
+
167
+ print(result)
168
+ return result
169
+
170
+
171
+ @app.post("/test_post/", response_description="List all existing call records", response_model=UserCall)
172
+ def test_post():
173
+ request_data = {
174
+ "call_id": "TESTID000001"
175
+ }
176
+
177
+ result = create_calls(get_collection_calls(), request_data)
178
+
179
+ # return {"message": "Welcome to InterpreTalk!"}
180
+ return result
181
+
182
+
183
+ async def send_translated_text(client_id, original_text, translated_text, room_id):
184
+ print('SEND_TRANSLATED_TEXT IS WOKRING IN FASTAPI BACKEND...')
185
+ print(rooms)
186
+ print(clients)
187
+
188
+ data = {
189
+ "author": str(client_id),
190
+ "original_text": str(original_text),
191
+ "translated_text": str(translated_text),
192
+ "timestamp": str(datetime.now())
193
+ }
194
+ logger.warning("SENDING TRANSLATED TEXT TO CLIENT")
195
+ await sio.emit("translated_text", data, room=room_id)
196
+ logger.warning("SUCCESSFULLY SEND AUDIO TO FRONTEND")
197
+
198
+ @sio.on("connect")
199
+ async def connect(sid, environ):
200
+ print(f"📥 [event: connected] sid={sid}")
201
+ query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
202
+ client_id = query_params.get("client_id")
203
+ logger.info(f"📥 [event: connected] sid={sid}, client_id={client_id}")
204
+ # sid = socketid, client_id = client specific ID ,always the same for same user
205
+ clients[sid] = Client(sid, client_id)
206
+ logger.warning(f"Client connected: {sid}")
207
+ logger.warning(clients)
208
+
209
+ @sio.on("disconnect")
210
+ async def disconnect(sid): # BO - also pass call id as parameter for updating MongoDB
211
+ logger.debug(f"📤 [event: disconnected] sid={sid}")
212
+ clients.pop(sid, None)
213
+ # BO -> Update Call record with call duration, key terms
214
+
215
+ @sio.on("target_language")
216
+ async def target_language(sid, target_lang):
217
+ logger.info(f"📥 [event: target_language] sid={sid}, target_lang={target_lang}")
218
+ clients[sid].target_language = target_lang
219
+
220
+ @sio.on("call_user")
221
+ async def call_user(sid, call_id):
222
+ clients[sid].call_id = call_id
223
+ logger.warning(f"CALL {sid}: entering room {call_id}")
224
+ rooms[call_id] = rooms.get(call_id, [])
225
+ if sid not in rooms[call_id] and len(rooms[call_id]) < 2:
226
+ rooms[call_id].append(sid)
227
+ sio.enter_room(sid, call_id)
228
+ else:
229
+ logger.warning(f"CALL {sid}: room {call_id} is full")
230
+ # await sio.emit("room_full", room=call_id, to=sid)
231
+
232
+ # # BO - Get call id from dictionary created during socketio connection
233
+ # client_id = clients[sid].client_id
234
+
235
+ # logger.warning(f"NOW TRYING TO CREATE DB RECORD FOR Caller with ID: {client_id} for call: {call_id}")
236
+ # # # BO -> Create Call Record with Caller and call_id field (None for callee, duration, terms..)
237
+ # request_data = {
238
+ # "call_id": str(call_id),
239
+ # "caller_id": str(client_id),
240
+ # "creation_date": str(datetime.now())
241
+ # }
242
+
243
+ # response = create_calls(get_collection_calls(), request_data)
244
+ # print(response) # BO - print created db call record
245
+
246
+ @sio.on("audio_config")
247
+ async def audio_config(sid, sample_rate):
248
+ clients[sid].original_sr = sample_rate
249
+
250
+
251
+ @sio.on("answer_call")
252
+ async def answer_call(sid, call_id):
253
+
254
+ clients[sid].call_id = call_id
255
+ logger.warning(f"ANSWER {sid}: entering room {call_id}")
256
+ rooms[call_id] = rooms.get(call_id, [])
257
+ if sid not in rooms[call_id] and len(rooms[call_id]) < 2:
258
+ rooms[call_id].append(sid)
259
+ sio.enter_room(sid, call_id)
260
+ else:
261
+ logger.warning(f"ANSWER {sid}: room {call_id} is full")
262
+ # await sio.emit("room_full", room=call_id, to=sid)
263
+
264
+
265
+ # # BO - Get call id from dictionary created during socketio connection
266
+ # client_id = clients[sid].client_id
267
+
268
+ # # BO -> Update Call Record with Callee field based on call_id
269
+ # logger.warning(f"NOW UPDATING MongoDB RECORD FOR Caller with ID: {client_id} for call: {call_id}")
270
+ # # # BO -> Create Call Record with callee_id field (None for callee, duration, terms..)
271
+ # request_data = {
272
+ # "callee_id": client_id
273
+ # }
274
+
275
+ # response = update_calls(get_collection_calls(), call_id, request_data)
276
+ # print(response) # BO - print created db call record
277
+
278
+
279
+ @sio.on("incoming_audio")
280
+ async def incoming_audio(sid, data, call_id):
281
+ try:
282
+ clients[sid].add_bytes(data)
283
+
284
+ if clients[sid].get_length() >= MAX_BYTES_BUFFER:
285
+ logger.warning('Buffer full, now outputting...')
286
+ output_path = clients[sid].output_path
287
+ vad_result, resampled_audio = clients[sid].resample_and_write_to_file()
288
+ # source lang is speakers tgt language 😃
289
+ src_lang = clients[sid].target_language
290
+ if vad_result:
291
+ logger.warning('Speech detected, now processing audio.....')
292
+ tgt_sid = next(id for id in rooms[call_id] if id != sid)
293
+ tgt_lang = clients[tgt_sid].target_language
294
+ # following example from https://github.com/facebookresearch/seamless_communication/blob/main/docs/m4t/README.md#transformers-usage
295
+ output_tokens = processor(audios=resampled_audio, src_lang=src_lang, return_tensors="pt")
296
+ model_output = model.generate(**output_tokens, tgt_lang=src_lang, generate_speech=False)[0].tolist()[0]
297
+ asr_text = processor.decode(model_output, skip_special_tokens=True)
298
+ print(f"ASR TEXT = {asr_text}")
299
+ # ASR TEXT => ORIGINAL TEXT
300
+
301
+ t2t_tokens = processor(text=asr_text, src_lang=src_lang, tgt_lang=tgt_lang, return_tensors="pt")
302
+ print(f"FIRST TYPE = {type(output_tokens)}, SECOND TYPE = {type(t2t_tokens)}")
303
+ translated_data = model.generate(**t2t_tokens, tgt_lang=tgt_lang, generate_speech=False)[0].tolist()[0]
304
+ translated_text = processor.decode(translated_data, skip_special_tokens=True)
305
+ print(f"TRANSLATED TEXT = {translated_text}")
306
+
307
+ # BO -> send translated_text to mongodb as caption record update based on call_id
308
+ # send_captions(clients[sid].client_id, asr_text, translated_text, call_id)
309
+
310
+ # TRANSLATED TEXT
311
+ # PM - text_output is a list with 1 string
312
+ await send_translated_text(clients[sid].client_id, asr_text, translated_text, call_id)
313
+
314
+ # # BO -> send translated_text to mongodb as caption record update based on call_id
315
+ # send_captions(clients[sid].client_id, asr_text, translated_text, call_id)
316
+
317
+ except Exception as e:
318
+ logger.error(f"Error in incoming_audio: {e.with_traceback()}")
319
+
320
+ def send_captions(client_id, original_text, translated_text, call_id):
321
+ # BO -> Update Call Record with Callee field based on call_id
322
+ print(f"Now updating Caption field in call record for Caller with ID: {client_id} for call: {call_id}")
323
+
324
+ data = {
325
+ "author": str(client_id),
326
+ "original_text": str(original_text),
327
+ "translated_text": str(translated_text),
328
+ "timestamp": str(datetime.now())
329
+ }
330
+
331
+ response = update_captions(get_collection_calls(), call_id, data)
332
+ return response
333
+
334
+ app.mount("/", socketio_app)
335
+
336
+ if __name__ == '__main__':
337
+ uvicorn.run("main:app", host='127.0.0.1', port=8080, log_level="info")
338
+
backend/models/Seamless/vad_s2st_sc_24khz_main.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
2
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
3
+ unity_model_name: seamless_streaming_unity
4
+ sentencepiece_model: spm_256k_nllb100.model
5
+
6
+ task: s2st
7
+ tgt_lang: "eng"
8
+ min_unit_chunk_size: 50
9
+ decision_threshold: 0.7
10
+ no_early_stop: True
11
+ block_ngrams: True
12
+ vocoder_name: vocoder_v2
13
+ expr_vocoder_name: vocoder_pretssel
14
+ gated_model_dir: .
15
+ expr_vocoder_gain: 3.0
16
+ upstream_idx: 1
17
+ wav2vec_yaml: wav2vec.yaml
18
+ min_starting_wait_w2vbert: 192
19
+
20
+ config_yaml: cfg_fbank_u2t.yaml
21
+ upstream_idx: 1
22
+ detokenize_only: True
23
+ device: cuda:0
24
+ max_len_a: 0
25
+ max_len_b: 1000
backend/models/SeamlessStreaming/vad_s2st_sc_main.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ agent_class: seamless_communication.streaming.agents.seamless_streaming_s2st.SeamlessStreamingS2STJointVADAgent
2
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
3
+ unity_model_name: seamless_streaming_unity
4
+ sentencepiece_model: spm_256k_nllb100.model
5
+
6
+ task: s2st
7
+ tgt_lang: "eng"
8
+ min_unit_chunk_size: 50
9
+ decision_threshold: 0.7
10
+ no_early_stop: True
11
+ block_ngrams: True
12
+ vocoder_name: vocoder_v2
13
+ wav2vec_yaml: wav2vec.yaml
14
+ min_starting_wait_w2vbert: 192
15
+
16
+ config_yaml: cfg_fbank_u2t.yaml
17
+ upstream_idx: 1
18
+ detokenize_only: True
19
+ device: cuda:0
20
+ max_len_a: 0
21
+ max_len_b: 1000
backend/mongodb/endpoints/__pycache__/calls.cpython-310.pyc ADDED
Binary file (2.9 kB). View file
 
backend/mongodb/endpoints/__pycache__/users.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
backend/mongodb/endpoints/calls.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Body, Request, status
2
+ from typing import List
3
+ from datetime import date
4
+
5
+ import sys
6
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
7
+
8
+ from ..operations import calls as calls
9
+ from ..models.calls import UserCall, UpdateCall
10
+
11
+ router = APIRouter(prefix="/call",
12
+ tags=["Calls"])
13
+
14
+ @router.post("/create-call", response_description="Create a new user call record", status_code=status.HTTP_201_CREATED, response_model=UserCall)
15
+ async def create_calls(request: Request, user_calls: UserCall = Body(...)):
16
+ return calls.create_calls(request, user_calls)
17
+
18
+ @router.get("/list-call", response_description="List all existing call records", response_model=List[UserCall])
19
+ async def list_calls(request: Request, limit: int):
20
+ return calls.list_calls(request, 100)
21
+
22
+ @router.get("/find-call/{call_id}", response_description="Find user's calls based on User ID", response_model=UserCall)
23
+ async def find_calls(request: Request, call_id: str):
24
+ return calls.find_calls(request, call_id)
25
+
26
+ @router.get("/find-user/{user_id}", response_description="Find user's calls based on User ID", response_model=List[UserCall])
27
+ async def find_user_calls(request: Request, user_id: str):
28
+ return calls.find_user_calls(request, user_id)
29
+
30
+ '''Key terms list can have variable length -> using POST request over GET'''
31
+ @router.post("/find-term/", response_description="Find calls based on key term list", response_model=List[UserCall])
32
+ async def list_transcripts_by_key_terms(request: Request, key_terms: List[str]):
33
+ return calls.list_transcripts_by_key_terms(request, key_terms)
34
+
35
+ @router.get("/find-date/{start_date}/{end_date}", response_description="Find calls based on date ranges", response_model=List[UserCall])
36
+ async def list_transcripts_by_dates(request: Request, start_date: str, end_date: str):
37
+ return calls.list_transcripts_by_dates(request, start_date, end_date)
38
+
39
+ @router.get("/find-duration/{min_len}/{max_len}", response_description="Find calls based on call duration in minutes", response_model=List[UserCall])
40
+ async def list_transcripts_by_duration(request: Request, min_len: int, max_len: int):
41
+ return calls.list_transcripts_by_duration(request, min_len, max_len)
42
+
43
+ @router.put("/update-call/{call_id}", response_description="Update an existing call", response_model=UpdateCall)
44
+ async def update_calls(request: Request, call_id: str, user_calls: UpdateCall = Body(...)):
45
+ return calls.update_calls(request, call_id, user_calls)
46
+
47
+ @router.put("/update-captions/{call_id}", response_description="Update an existing call", response_model=UpdateCall)
48
+ async def update_captions(request: Request, call_id: str, user_calls: UpdateCall = Body(...)):
49
+ return calls.update_captions(request, call_id, user_calls)
50
+
51
+ @router.delete("/delete-call/{call_id}", response_description="Delete a call by its id")
52
+ async def delete_call(request: Request, call_id :str):
53
+ return calls.delete_calls(request, call_id)
backend/mongodb/endpoints/users.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Body, Request, status
2
+ from typing import List
3
+ import sys
4
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
5
+ from ..models.users import User, UpdateUser
6
+ from ..operations import users as users
7
+
8
+ router = APIRouter(prefix="/user",
9
+ tags=["User"])
10
+
11
+ @router.post("/", response_description="Create a new user", status_code=status.HTTP_201_CREATED, response_model=User)
12
+ async def create_user(request: Request, user: User = Body(...)):
13
+ return users.create_user(request,user)
14
+
15
+ @router.get("/", response_description="List users", response_model=List[User])
16
+ async def list_users(request: Request):
17
+ return users.list_users(request, 100)
18
+
19
+ @router.put("/{user_id}", response_description="Update a User", response_model=UpdateUser)
20
+ async def update_user(request: Request, user_id: str, user: UpdateUser = Body(...)):
21
+ return users.update_user(request, user_id, user)
22
+
23
+ @router.get("/{user_id}", response_description="Get a single user by id", response_model=User)
24
+ async def find_user(request: Request, user_id: str):
25
+ return users.find_user(request, user_id)
26
+
27
+ @router.get("/name/{user_name}", response_description="Get a single user by name", response_model=User)
28
+ async def find_user_name(request: Request, name: str):
29
+ return users.find_user_name(request, name)
30
+
31
+ @router.get("/email/{email_addr}", response_description="Get a single user by email", response_model=User)
32
+ async def find_user_email(request: Request, email: str):
33
+ return users.find_user_email(request, email)
34
+
35
+ @router.delete("/{user_id}", response_description="Delete a user")
36
+ async def delete_user(request: Request, user_id:str):
37
+ return users.delete_user(request, user_id)
backend/mongodb/models/__pycache__/calls.cpython-310.pyc ADDED
Binary file (2.51 kB). View file
 
backend/mongodb/models/__pycache__/users.cpython-310.pyc ADDED
Binary file (1.83 kB). View file
 
backend/mongodb/models/calls.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ from typing import List, Dict, Optional
3
+ from datetime import datetime
4
+ from pydantic import BaseModel, Field, PrivateAttr
5
+
6
+ import sys
7
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
8
+
9
+ # from pydantic_mongo import ObjectIdField
10
+
11
+
12
+ ''' Class for storing captions generated by SeamlessM4T'''
13
+ class UserCaptions(BaseModel):
14
+ _id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4) # private attr not included in http calls
15
+ author: str
16
+ original_text: str
17
+ translated_text: str
18
+ timestamp: datetime = Field(default_factory=datetime.now)
19
+
20
+
21
+ '''Class for storing past call records from users'''
22
+ class UserCall(BaseModel):
23
+ _id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4)
24
+ call_id: Optional[str] = None
25
+ caller_id: Optional[str] = None
26
+ callee_id: Optional[str] = None
27
+ creation_date: datetime = Field(default_factory=datetime.now, alias="date")
28
+ duration: Optional[int] = None # milliseconds
29
+ captions: Optional[List[UserCaptions]] = None
30
+ # captions: List[Dict[str, str | float]]
31
+ key_terms: Optional[List[str]] = None
32
+
33
+ '''Implement validation check on transcript ID if in transcript records'''
34
+ # @validator('transcript_id')
35
+ # def check_transcript_id_transcripts_collection(cls,transcript_id):
36
+ # transcript_ids=['',''] # check transcript_ids in collection
37
+ # if transcript_id not in transcript_ids:
38
+ # raise ValueError(f'transcript ID must be in {transcript_ids}')
39
+ # return transcript_id
40
+
41
+ class Config:
42
+ populate_by_name = True
43
+ json_schema_extra = {
44
+ "example": {
45
+ "call_id": "65eef930e9abd3b1e3506906",
46
+ "caller_id": "65ede65b6d246e52aaba9d4f",
47
+ "callee_id": "65edda944340ac84c1f00758",
48
+ "duration": 360,
49
+ "captions": [{"author": "shamzino", "original_text": "eng: This is original_text english text", "translated_text": "spa: este es el texto traducido al español", "timestamp": "2024-03-28T16:15:50.956055"},
50
+ {"author": "benjino", "original_text": "eng: This is source english text", "translated_text": "spa: este es el texto fuente al español", "timestamp": "2024-03-28T16:16:20.34625"}],
51
+ "key_terms": ["original_text", "source", "english", "text"]
52
+ }
53
+ }
54
+
55
+
56
+ ''' Class for updating User Call record'''
57
+ class UpdateCall(BaseModel):
58
+ call_id: Optional[str] = None
59
+ caller_id: Optional[str] = None
60
+ callee_id: Optional[str] = None
61
+ duration: Optional[int] = None
62
+ captions: Optional[List[UserCaptions]] = None
63
+ key_terms: Optional[List[str]] = None
64
+
65
+ class Config:
66
+ populate_by_name = True
67
+ json_schema_extra = {
68
+ "example": {
69
+ "duration": "500"
70
+ }
71
+ }
backend/mongodb/models/users.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ from typing import List, Optional
3
+ from pydantic import BaseModel, Field, SecretStr, PrivateAttr
4
+ from pydantic.networks import EmailStr
5
+
6
+ from pydantic_mongo import ObjectIdField
7
+
8
+ '''Class for user model used to relate users to past calls'''
9
+ class User(BaseModel):
10
+ _id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4) # private attr not included in http calls
11
+ user_id: str
12
+ name: str
13
+ email: EmailStr = Field(unique=True, index=True)
14
+ password: SecretStr
15
+ call_ids: Optional[List[str]] = None
16
+
17
+ class Config:
18
+ populate_by_name = True
19
+ json_schema_extra = {
20
+ "example": {
21
+ "user_id": "65ede65b6d246e52aaba9d4f",
22
+ "name": "benjolo",
23
+ "email": "[email protected]",
24
+ "password": "therealbenjolo",
25
+ "call_ids": ["65e205ced1be3a22854ff300", "65df8c3eba9c7c2ed1b20e85"]
26
+ }
27
+ }
28
+
29
+ '''Class for updating user records'''
30
+ class UpdateUser(BaseModel):
31
+ user_id: Optional[str] = None
32
+ name: Optional[str] = None
33
+ email: Optional[EmailStr] = None
34
+ ''' To decode use -> SecretStr("abc").get_secret_value()'''
35
+ # password: Optional[SecretStr]
36
+ call_ids: Optional[List[str]] = None
37
+
38
+ class Config:
39
+ populate_by_name = True
40
+ json_schema_extra = {
41
+ "example": {
42
+ "email": "[email protected]",
43
+ "call_ids": ["65e205ced1be3a22854ff300", "65df8c3eba9c7c2ed1b20e85", "65eef930e9abd3b1e3506906"]
44
+ }
45
+ }
46
+
backend/mongodb/operations/__pycache__/calls.cpython-310.pyc ADDED
Binary file (4.26 kB). View file
 
backend/mongodb/operations/__pycache__/users.cpython-310.pyc ADDED
Binary file (2.98 kB). View file
 
backend/mongodb/operations/calls.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import Body, Request, HTTPException, status
2
+ from fastapi.encoders import jsonable_encoder
3
+ import sys
4
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
5
+ from ..models.calls import UpdateCall, UserCall, UserCaptions
6
+
7
+ def get_collection_calls(request: Request):
8
+ try:
9
+ # return request.app.database["call_records"]
10
+ return request.app.database["call_test"]
11
+ except:
12
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Unable to find call records Database.")
13
+
14
+
15
+ def create_calls(collection, user: UserCall = Body(...)):
16
+ calls = jsonable_encoder(user)
17
+ # new_calls = get_collection_calls(request).insert_one(calls)
18
+ new_calls = collection.insert_one(calls)
19
+ # created_calls = get_collection_calls(request).find_one({"_id": new_calls.inserted_id})
20
+ created_calls = collection.find_one({"_id": new_calls.inserted_id})
21
+
22
+ return created_calls
23
+
24
+
25
+ def list_calls(request: Request, limit: int):
26
+ try:
27
+ calls = list(get_collection_calls(request).find(limit = limit))
28
+ # dateTest = calls[2]['date']
29
+ return calls
30
+ except:
31
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No existing call records yet.")
32
+
33
+
34
+ '''Finding calls based on call id'''
35
+ def find_calls(request: Request, call_id: str):
36
+ # if user_calls := get_collection_calls(request).find_one({"call_id": call_id}) is not None:
37
+ user_calls = get_collection_calls(request).find_one({"call_id": call_id})
38
+ if user_calls is not None:
39
+ return user_calls
40
+ else:
41
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with ID: '{call_id}' not found.")
42
+
43
+
44
+ '''Finding calls based on user id'''
45
+ def find_user_calls(request: Request, user_id: str):
46
+ user_calls = list(get_collection_calls(request).find({"$or": [{"caller_id": user_id}, {"callee_id": user_id}]})) # match on caller or callee ID
47
+ if len(user_calls):
48
+ return user_calls
49
+ else:
50
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with ID: '{user_id}' has no calls yet.")
51
+
52
+
53
+ '''Finding calls based on key terms list'''
54
+ def list_transcripts_by_key_terms(request: Request, key_terms_list: list[str] = Body(...)):
55
+ key_terms_list = jsonable_encoder(key_terms_list)
56
+
57
+ call_records = list(get_collection_calls(request).find({"key_terms": {"$in": key_terms_list}}, {'_id': 0})) # exclude returning ObjectID in find()
58
+
59
+ # Check if any call records were returned
60
+ if len(call_records):
61
+ return call_records
62
+ else:
63
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with key terms: '{key_terms_list}' not found!")
64
+
65
+
66
+ '''Finding calls based on date ranges'''
67
+ def list_transcripts_by_dates(request: Request, start_date: str, end_date: str):
68
+ print(start_date, end_date)
69
+ # Convert strings to date string in YYYY-MM-ddT00:00:00 format
70
+ start_date = f'{start_date}T00:00:00'
71
+ end_date = f'{end_date}T00:00:00'
72
+
73
+ call_records = list(get_collection_calls(request).find({"date":{"$gte": start_date, "$lte": end_date}}))
74
+
75
+ if len(call_records):
76
+ return call_records
77
+ else:
78
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with creation date between: '{start_date} - {end_date}' not found!")
79
+
80
+
81
+ '''Finding calls based on call lengths'''
82
+ def list_transcripts_by_duration(request: Request, min_len: int, max_len: int):
83
+
84
+ call_records = list(get_collection_calls(request).find({"duration":{"$gte": min_len, "$lte": max_len}}))
85
+
86
+ if len(call_records):
87
+ return call_records
88
+ else:
89
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with duration between: '{min_len} - {max_len}' milliseconds not found!")
90
+
91
+
92
+ def update_calls(collection, call_id: str, calls: UpdateCall = Body(...)):
93
+ # calls = {k: v for k, v in calls.model_dump().items() if v is not None} #loop in the dict
94
+ calls = {k: v for k, v in calls.items() if v is not None} #loop in the dict
95
+ if len(calls) >= 1:
96
+ update_result = collection.update_one({"call_id": call_id}, {"$set": calls})
97
+
98
+ if update_result.modified_count == 0:
99
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call not updated!")
100
+
101
+ if (existing_item := collection.find_one({"call_id": call_id})) is not None:
102
+ return existing_item
103
+
104
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call not found!")
105
+
106
+ def update_captions(collection, call_id: str, calls: UserCaptions = Body(...)):
107
+ # calls = {k: v for k, v in calls.model_dump().items() if v is not None}
108
+ calls = {k: v for k, v in calls.items() if v is not None}
109
+ if len(calls) >= 1:
110
+ update_result = collection.update_one({"call_id": call_id},
111
+ {"$push": {"captions": calls}})
112
+
113
+ if update_result.modified_count == 0:
114
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Captions not updated!")
115
+
116
+ if (existing_item := collection.find_one({"call_id": call_id})) is not None:
117
+ return existing_item
118
+
119
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Captions not found!")
120
+
121
+
122
+ def delete_calls(request: Request, call_id: str):
123
+ deleted_calls = get_collection_calls(request).delete_one({"call_id": call_id})
124
+
125
+ if deleted_calls.deleted_count == 1:
126
+ return f"Call deleted sucessfully!"
127
+
128
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call not found!")
backend/mongodb/operations/users.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import Body, Request, HTTPException, status
2
+ from fastapi.encoders import jsonable_encoder
3
+ import sys
4
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
5
+ from ..models.users import User, UpdateUser
6
+ from bson import ObjectId
7
+ import re
8
+
9
+
10
+ def get_collection_users(request: Request):
11
+ test = request.app.database["user_records"]
12
+ print(type(test))
13
+ return test
14
+
15
+
16
+ def create_user(request: Request, user: User = Body(...)):
17
+ user = jsonable_encoder(user)
18
+ new_user = get_collection_users(request).insert_one(user)
19
+ created_user = get_collection_users(request).find_one({"_id": new_user.inserted_id})
20
+ print("NEW ID IS:.........", new_user.inserted_id)
21
+ return created_user
22
+
23
+
24
+ def list_users(request: Request, limit: int):
25
+ try:
26
+ users = list(get_collection_users(request).find(limit = limit))
27
+ return users
28
+ except:
29
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No users found!")
30
+
31
+
32
+ def find_user(request: Request, user_id: str):
33
+ if (user := get_collection_users(request).find_one({"user_id": user_id})):
34
+ return user
35
+ else:
36
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
37
+
38
+
39
+ def find_user_name(request: Request, name: str):
40
+ # search for name in lowercase
41
+ if (user := get_collection_users(request).find_one({"name": re.compile('^' + re.escape(name) + '$', re.IGNORECASE)})):
42
+ return user
43
+ else:
44
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with name {name} not found!")
45
+
46
+
47
+ def find_user_email(request: Request, email: str):
48
+ if (user := get_collection_users(request).find_one({"email": re.compile('^' + re.escape(email) + '$', re.IGNORECASE)})):
49
+ return user
50
+ else:
51
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with Email Address {email} not found!")
52
+
53
+
54
+ ''' Update user record based on user object/json'''
55
+ def update_user(request: Request, user_id: str, user: UpdateUser):
56
+ try:
57
+ user = {k: v for k, v in user.model_dump().items() if v is not None}
58
+ if len(user) >= 1:
59
+ update_result = get_collection_users(request).update_one({"user_id": user_id}, {"$set": user})
60
+
61
+ if update_result.modified_count == 0:
62
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id: '{user_id}' not found and updated!")
63
+
64
+ if (existing_users := get_collection_users(request).find_one({"user_id": user_id})) is not None:
65
+ return existing_users
66
+ except:
67
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id: '{user_id}' not found and updated!")
68
+
69
+
70
+ def delete_user(request: Request, user_id: str):
71
+ try:
72
+ deleted_user = get_collection_users(request).delete_one({"user_id": user_id})
73
+
74
+ if deleted_user.deleted_count == 1:
75
+ return f"User with user_id {user_id} deleted sucessfully"
76
+ except:
77
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
backend/pcmToWav.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wave
2
+ import os
3
+
4
+
5
+ basePath = os.path.expanduser("~/Desktop/")
6
+
7
+
8
+ def convert_pcm_to_wav():
9
+ # PCM file parameters (should match the parameters used to create the PCM file)
10
+ pcm_file = basePath + 'output.pcm'
11
+ wav_file = 'pcmconverted.wav'
12
+ sample_rate = 16000 # Example: 16000 Hz
13
+ channels = 1 # Example: 2 for stereo
14
+ sample_width = 2 # Example: 2 bytes (16 bits), change if your PCM format is different
15
+
16
+ # Read the PCM file and write to a WAV file
17
+ with open(pcm_file, 'rb') as pcmfile:
18
+ pcm_data = pcmfile.read()
19
+
20
+ with wave.open(wav_file, 'wb') as wavfile:
21
+ wavfile.setnchannels(channels)
22
+ wavfile.setsampwidth(sample_width)
23
+ wavfile.setframerate(sample_rate)
24
+ wavfile.writeframes(pcm_data)
25
+
26
+ convert_pcm_to_wav()
27
+
28
+ # def generateCaptions(filepath):
29
+
30
+ # ! This might be redundant due to seamless-streaming
31
+
32
+
33
+
34
+ print(f"Converted {pcm_file} to {wav_file}")
backend/preprocess_wav.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import soundfile
2
+ import io
3
+ from typing import Any, Tuple, Union, Optional
4
+ import numpy as np
5
+ import torch
6
+
7
+ def preprocess_wav(data: Any, incoming_sample_rate) -> Tuple[np.ndarray, int]:
8
+ segment, sample_rate = soundfile.read(
9
+ io.BytesIO(data),
10
+ dtype="float32",
11
+ always_2d=True,
12
+ frames=-1,
13
+ start=0,
14
+ format="RAW",
15
+ subtype="PCM_16",
16
+ samplerate=incoming_sample_rate,
17
+ channels=1,
18
+ )
19
+ return segment, sample_rate
20
+
21
+ def convert_waveform(
22
+ waveform: Union[np.ndarray, torch.Tensor],
23
+ sample_rate: int,
24
+ normalize_volume: bool = False,
25
+ to_mono: bool = False,
26
+ to_sample_rate: Optional[int] = None,
27
+ ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
28
+ """convert a waveform:
29
+ - to a target sample rate
30
+ - from multi-channel to mono channel
31
+ - volume normalization
32
+
33
+ Args:
34
+ waveform (numpy.ndarray or torch.Tensor): 2D original waveform
35
+ (channels x length)
36
+ sample_rate (int): original sample rate
37
+ normalize_volume (bool): perform volume normalization
38
+ to_mono (bool): convert to mono channel if having multiple channels
39
+ to_sample_rate (Optional[int]): target sample rate
40
+ Returns:
41
+ waveform (numpy.ndarray): converted 2D waveform (channels x length)
42
+ sample_rate (float): target sample rate
43
+ """
44
+ try:
45
+ import torchaudio.sox_effects as ta_sox
46
+ except ImportError:
47
+ raise ImportError("Please install torchaudio: pip install torchaudio")
48
+
49
+ effects = []
50
+ if normalize_volume:
51
+ effects.append(["gain", "-n"])
52
+ if to_sample_rate is not None and to_sample_rate != sample_rate:
53
+ effects.append(["rate", f"{to_sample_rate}"])
54
+ if to_mono and waveform.shape[0] > 1:
55
+ effects.append(["channels", "1"])
56
+ if len(effects) > 0:
57
+ is_np_input = isinstance(waveform, np.ndarray)
58
+ _waveform = torch.from_numpy(waveform) if is_np_input else waveform
59
+ converted, converted_sample_rate = ta_sox.apply_effects_tensor(
60
+ _waveform, sample_rate, effects
61
+ )
62
+ if is_np_input:
63
+ converted = converted.numpy()
64
+ return converted, converted_sample_rate
65
+ return waveform, sample_rate
backend/requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # seamless_communication
4
+ git+https://github.com/facebookresearch/seamless_communication.git
5
+ # ./whl/seamless_communication-1.0.0-py3-none-any.whl
6
+ Flask==2.1.3
7
+ Flask_Sockets==0.2.1
8
+ g2p_en==2.1.0
9
+ gevent==22.10.2
10
+ gevent_websocket==0.10.1
11
+ librosa==0.9.2
12
+ numpy==1.24.4
13
+ openai_whisper==20230124
14
+ protobuf==4.24.2
15
+ psola==0.0.1
16
+ pydub==0.25.1
17
+ silero==0.4.1
18
+ soundfile==0.11.0
19
+ stable_ts==1.4.0
20
+ # to be installed by user for desired PyTorch version
21
+ torch==2.1.2
22
+ torchaudio==2.1.2
23
+ # simuleval # to be installed by seamless_communication
24
+ Werkzeug==2.0.3
25
+ whisper==1.1.10
26
+ colorlog==6.7.0
27
+ python-socketio==5.9.0
28
+ uvicorn[standard]==0.23.2
29
+ parallel-wavegan==0.5.5
30
+ python-jose[cryptography]==3.3.0
31
+ starlette==0.32.0.post1
32
+ hf_transfer==0.1.4
33
+ huggingface_hub==0.19.4
34
+ python-socketio==5.9.0
35
+ contextlib2==21.6.0
36
+ pymongo==4.6.2
backend/routes/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from.routing import router
backend/routes/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (235 Bytes). View file
 
backend/routes/__pycache__/routing.cpython-310.pyc ADDED
Binary file (375 Bytes). View file
 
backend/routes/routing.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ import sys
3
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
4
+ from mongodb.endpoints import users, calls
5
+
6
+ router = APIRouter()
7
+ router.include_router(calls.router)
8
+ router.include_router(users.router)
9
+ # router.include_router(transcripts.router)
backend/seamless/__init__.py ADDED
File without changes
backend/seamless/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (163 Bytes). View file
 
backend/seamless/__pycache__/room.cpython-310.pyc ADDED
Binary file (2.77 kB). View file
 
backend/seamless/__pycache__/simuleval_agent_directory.cpython-310.pyc ADDED
Binary file (5.33 kB). View file
 
backend/seamless/__pycache__/simuleval_transcoder.cpython-310.pyc ADDED
Binary file (14.1 kB). View file
 
backend/seamless/__pycache__/speech_and_text_output.cpython-310.pyc ADDED
Binary file (668 Bytes). View file
 
backend/seamless/__pycache__/transcoder_helpers.cpython-310.pyc ADDED
Binary file (872 Bytes). View file
 
backend/seamless/room.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import json
2
+ import uuid
3
+
4
+
5
+ class Room:
6
+ def __init__(self, room_id) -> None:
7
+ self.room_id = room_id
8
+ # members is a dict from client_id to Member
9
+ self.members = {}
10
+
11
+ # listeners and speakers are lists of client_id's
12
+ self.listeners = []
13
+ self.speakers = []
14
+
15
+ def __str__(self) -> str:
16
+ return f"Room {self.room_id} ({len(self.members)} member{'s' if len(self.members) == 1 else ''})"
17
+
18
+ def to_json(self):
19
+ varsResult = vars(self)
20
+ # Remember: result is just a shallow copy, so result.members === self.members
21
+ # Because of that, we need to jsonify self.members without writing over result.members,
22
+ # which we do here via dictionary unpacking (the ** operator)
23
+ result = {
24
+ **varsResult,
25
+ "members": {key: value.to_json() for (key, value) in self.members.items()},
26
+ "activeTranscoders": self.get_active_transcoders(),
27
+ }
28
+
29
+ return result
30
+
31
+ def get_active_connections(self):
32
+ return len(
33
+ [m for m in self.members.values() if m.connection_status == "connected"]
34
+ )
35
+
36
+ def get_active_transcoders(self):
37
+ return len([m for m in self.members.values() if m.transcoder is not None])
38
+
39
+ def get_room_status_dict(self):
40
+ return {
41
+ "activeConnections": self.get_active_connections(),
42
+ "activeTranscoders": self.get_active_transcoders(),
43
+ }
44
+
45
+
46
+ class Member:
47
+ def __init__(self, client_id, session_id, name) -> None:
48
+ self.client_id = client_id
49
+ self.session_id = session_id
50
+ self.name = name
51
+ self.connection_status = "connected"
52
+ self.transcoder = None
53
+ self.requested_output_type = None
54
+ self.transcoder_dynamic_config = None
55
+
56
+ def __str__(self) -> str:
57
+ return f"{self.name} (id: {self.client_id[:4]}...) ({self.connection_status})"
58
+
59
+ def to_json(self):
60
+ self_vars = vars(self)
61
+ return {
62
+ **self_vars,
63
+ "transcoder": self.transcoder is not None,
64
+ }
backend/seamless/simuleval_agent_directory.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Creates a directory in which to look up available agents
2
+
3
+ import os
4
+ from typing import List, Optional
5
+ from seamless.simuleval_transcoder import SimulevalTranscoder
6
+ import json
7
+ import logging
8
+
9
+ logger = logging.getLogger("socketio_server_pubsub")
10
+
11
+ # fmt: off
12
+ M4T_P0_LANGS = [
13
+ "eng",
14
+ "arb", "ben", "cat", "ces", "cmn", "cym", "dan",
15
+ "deu", "est", "fin", "fra", "hin", "ind", "ita",
16
+ "jpn", "kor", "mlt", "nld", "pes", "pol", "por",
17
+ "ron", "rus", "slk", "spa", "swe", "swh", "tel",
18
+ "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie",
19
+ ]
20
+ # fmt: on
21
+
22
+
23
+ class NoAvailableAgentException(Exception):
24
+ pass
25
+
26
+
27
+ class AgentWithInfo:
28
+ def __init__(
29
+ self,
30
+ agent,
31
+ name: str,
32
+ modalities: List[str],
33
+ target_langs: List[str],
34
+ # Supported dynamic params are defined in StreamingTypes.ts
35
+ dynamic_params: List[str] = [],
36
+ description="",
37
+ has_expressive: Optional[bool] = None,
38
+ ):
39
+ self.agent = agent
40
+ self.has_expressive = has_expressive
41
+ self.name = name
42
+ self.description = description
43
+ self.modalities = modalities
44
+ self.target_langs = target_langs
45
+ self.dynamic_params = dynamic_params
46
+
47
+ def get_capabilities_for_json(self):
48
+ return {
49
+ "name": self.name,
50
+ "description": self.description,
51
+ "modalities": self.modalities,
52
+ "targetLangs": self.target_langs,
53
+ "dynamicParams": self.dynamic_params,
54
+ }
55
+
56
+ @classmethod
57
+ def load_from_json(cls, config: str):
58
+ """
59
+ Takes in JSON array of models to load in, e.g.
60
+ [{"name": "s2s_m4t_emma-unity2_multidomain_v0.1", "description": "M4T model that supports simultaneous S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]},
61
+ {"name": "s2s_m4t_expr-emma_v0.1", "description": "ES-EN expressive model that supports S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]}]
62
+ """
63
+ configs = json.loads(config)
64
+ agents = []
65
+ for config in configs:
66
+ agent = SimulevalTranscoder.build_agent(config["name"])
67
+ agents.append(
68
+ AgentWithInfo(
69
+ agent=agent,
70
+ name=config["name"],
71
+ modalities=config["modalities"],
72
+ target_langs=config["targetLangs"],
73
+ )
74
+ )
75
+ return agents
76
+
77
+
78
+ class SimulevalAgentDirectory:
79
+ # Available models. These are the directories where the models can be found, and also serve as an ID for the model.
80
+ seamless_streaming_agent = "SeamlessStreaming"
81
+ seamless_agent = "Seamless"
82
+
83
+ def __init__(self):
84
+ self.agents = []
85
+ self.did_build_and_add_agents = False
86
+
87
+ def add_agent(self, agent: AgentWithInfo):
88
+ self.agents.append(agent)
89
+
90
+ def build_agent_if_available(self, model_id, config_name=None):
91
+ agent = None
92
+ try:
93
+ if config_name is not None:
94
+ agent = SimulevalTranscoder.build_agent(
95
+ model_id,
96
+ config_name=config_name,
97
+ )
98
+ else:
99
+ agent = SimulevalTranscoder.build_agent(
100
+ model_id,
101
+ )
102
+ except Exception as e:
103
+ from fairseq2.assets.error import AssetError
104
+ logger.warning("Failed to build agent %s: %s" % (model_id, e))
105
+ if isinstance(e, AssetError):
106
+ logger.warning(
107
+ "Please download gated assets and set `gated_model_dir` in the config"
108
+ )
109
+ raise e
110
+
111
+ return agent
112
+
113
+ def build_and_add_agents(self, models_override=None):
114
+ if self.did_build_and_add_agents:
115
+ return
116
+
117
+ if models_override is not None:
118
+ agent_infos = AgentWithInfo.load_from_json(models_override)
119
+ for agent_info in agent_infos:
120
+ self.add_agent(agent_info)
121
+ else:
122
+ s2s_agent = None
123
+ if os.environ.get("USE_EXPRESSIVE_MODEL", "0") == "1":
124
+ logger.info("Building expressive model...")
125
+ s2s_agent = self.build_agent_if_available(
126
+ SimulevalAgentDirectory.seamless_agent,
127
+ config_name="vad_s2st_sc_24khz_main.yaml",
128
+ )
129
+ has_expressive = True
130
+ else:
131
+ logger.info("Building non-expressive model...")
132
+ s2s_agent = self.build_agent_if_available(
133
+ SimulevalAgentDirectory.seamless_streaming_agent,
134
+ config_name="vad_s2st_sc_main.yaml",
135
+ )
136
+ has_expressive = False
137
+
138
+ if s2s_agent:
139
+ self.add_agent(
140
+ AgentWithInfo(
141
+ agent=s2s_agent,
142
+ name=SimulevalAgentDirectory.seamless_streaming_agent,
143
+ modalities=["s2t", "s2s"],
144
+ target_langs=M4T_P0_LANGS,
145
+ dynamic_params=["expressive"],
146
+ description="multilingual expressive model that supports S2S and S2T",
147
+ has_expressive=has_expressive,
148
+ )
149
+ )
150
+
151
+ if len(self.agents) == 0:
152
+ logger.error(
153
+ "No agents were loaded. This likely means you are missing the actual model files specified in simuleval_agent_directory."
154
+ )
155
+
156
+ self.did_build_and_add_agents = True
157
+
158
+ def get_agent(self, name):
159
+ for agent in self.agents:
160
+ if agent.name == name:
161
+ return agent
162
+ return None
163
+
164
+ def get_agent_or_throw(self, name):
165
+ agent = self.get_agent(name)
166
+ if agent is None:
167
+ raise NoAvailableAgentException("No agent found with name= %s" % (name))
168
+ return agent
169
+
170
+ def get_agents_capabilities_list_for_json(self):
171
+ return [agent.get_capabilities_for_json() for agent in self.agents]
backend/seamless/simuleval_transcoder.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from simuleval.utils.agent import build_system_from_dir
2
+ from typing import Any, List, Optional, Tuple, Union
3
+ import numpy as np
4
+ import soundfile
5
+ import io
6
+ import asyncio
7
+ from simuleval.agents.pipeline import TreeAgentPipeline
8
+ from simuleval.agents.states import AgentStates
9
+ from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
10
+ import threading
11
+ import math
12
+ import logging
13
+ import sys
14
+ from pathlib import Path
15
+ import time
16
+ from g2p_en import G2p
17
+ import torch
18
+ import traceback
19
+ import time
20
+ import random
21
+ import colorlog
22
+
23
+ from .speech_and_text_output import SpeechAndTextOutput
24
+
25
+ MODEL_SAMPLE_RATE = 16_000
26
+
27
+ logger = logging.getLogger(__name__)
28
+ # logger.propagate = False
29
+ handler = colorlog.StreamHandler(stream=sys.stdout)
30
+ formatter = colorlog.ColoredFormatter(
31
+ "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
32
+ reset=True,
33
+ log_colors={
34
+ "DEBUG": "cyan",
35
+ "INFO": "green",
36
+ "WARNING": "yellow",
37
+ "ERROR": "red",
38
+ "CRITICAL": "red,bg_white",
39
+ },
40
+ )
41
+ handler.setFormatter(formatter)
42
+ logger.addHandler(handler)
43
+ logger.setLevel(logging.WARNING)
44
+
45
+
46
+ class OutputSegments:
47
+ def __init__(self, segments: Union[List[Segment], Segment]):
48
+ if isinstance(segments, Segment):
49
+ segments = [segments]
50
+ self.segments: List[Segment] = [s for s in segments]
51
+
52
+ @property
53
+ def is_empty(self):
54
+ return all(segment.is_empty for segment in self.segments)
55
+
56
+ @property
57
+ def finished(self):
58
+ return all(segment.finished for segment in self.segments)
59
+
60
+ def compute_length(self, g2p):
61
+ lengths = []
62
+ for segment in self.segments:
63
+ if segment.data_type == "text":
64
+ lengths.append(len([x for x in g2p(segment.content) if x != " "]))
65
+ elif segment.data_type == "speech":
66
+ lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
67
+ elif isinstance(segment, EmptySegment):
68
+ continue
69
+ else:
70
+ logger.warning(
71
+ f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
72
+ )
73
+ return max(lengths)
74
+
75
+ @classmethod
76
+ def join_output_buffer(
77
+ cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
78
+ ):
79
+ num_segments = len(buffer[0])
80
+ for i in range(num_segments):
81
+ segment_list = [
82
+ buffer[j][i]
83
+ for j in range(len(buffer))
84
+ if buffer[j][i].data_type is not None
85
+ ]
86
+ if len(segment_list) == 0:
87
+ continue
88
+ if len(set(segment.data_type for segment in segment_list)) != 1:
89
+ logger.warning(
90
+ f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
91
+ )
92
+ continue
93
+ data_type = segment_list[0].data_type
94
+ if data_type == "text":
95
+ if output.text is not None:
96
+ logger.warning("Multiple text outputs, overwriting!")
97
+ output.text = " ".join([segment.content for segment in segment_list])
98
+ elif data_type == "speech":
99
+ if output.speech_samples is not None:
100
+ logger.warning("Multiple speech outputs, overwriting!")
101
+ speech_out = []
102
+ for segment in segment_list:
103
+ speech_out += segment.content
104
+ output.speech_samples = speech_out
105
+ output.speech_sample_rate = segment.sample_rate
106
+ elif isinstance(segment_list[0], EmptySegment):
107
+ continue
108
+ else:
109
+ logger.warning(
110
+ f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text"
111
+ )
112
+
113
+ return output
114
+
115
+ def __repr__(self) -> str:
116
+ repr_str = str(self.segments)
117
+ return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
118
+
119
+
120
+ class SimulevalTranscoder:
121
+ def __init__(self, agent, sample_rate, debug, buffer_limit):
122
+ self.agent = agent.agent
123
+ self.has_expressive = agent.has_expressive
124
+ self.input_queue = asyncio.Queue()
125
+ self.output_queue = asyncio.Queue()
126
+ self.states = self.agent.build_states()
127
+ if debug:
128
+ self.get_states_root().debug = True
129
+ self.incoming_sample_rate = sample_rate
130
+ self.close = False
131
+ self.g2p = G2p()
132
+
133
+ # buffer all outgoing translations within this amount of time
134
+ self.output_buffer_idle_ms = 5000
135
+ self.output_buffer_size_limit = (
136
+ buffer_limit # phonemes for text, seconds for speech
137
+ )
138
+ self.output_buffer_cur_size = 0
139
+ self.output_buffer: List[List[Segment]] = []
140
+ self.speech_output_sample_rate = None
141
+
142
+ self.last_output_ts = time.time() * 1000
143
+ self.timeout_ms = (
144
+ 30000 # close the transcoder thread after this amount of silence
145
+ )
146
+ self.first_input_ts = None
147
+ self.first_output_ts = None
148
+ self.debug = debug
149
+ self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
150
+ if self.debug:
151
+ debug_folder = Path(__file__).resolve().parent.parent / "debug"
152
+ self.test_incoming_wav = soundfile.SoundFile(
153
+ debug_folder / f"{self.debug_ts}_test_incoming.wav",
154
+ mode="w+",
155
+ format="WAV",
156
+ subtype="PCM_16",
157
+ samplerate=self.incoming_sample_rate,
158
+ channels=1,
159
+ )
160
+ self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
161
+ debug_folder / f"{self.debug_ts}_test_input_segments.wav",
162
+ mode="w+",
163
+ format="WAV",
164
+ samplerate=MODEL_SAMPLE_RATE,
165
+ channels=1,
166
+ )
167
+
168
+ def get_states_root(self) -> AgentStates:
169
+ if isinstance(self.agent, TreeAgentPipeline):
170
+ # self.states is a dict
171
+ return self.states[self.agent.source_module]
172
+ else:
173
+ # self.states is a list
174
+ return self.states[0]
175
+
176
+ def reset_states(self):
177
+ if isinstance(self.agent, TreeAgentPipeline):
178
+ states_iter = self.states.values()
179
+ else:
180
+ states_iter = self.states
181
+ for state in states_iter:
182
+ state.reset()
183
+
184
+ def debug_log(self, *args):
185
+ if self.debug:
186
+ logger.info(*args)
187
+
188
+ @classmethod
189
+ def build_agent(cls, model_path, config_name):
190
+ logger.info(f"Building simuleval agent: {model_path}, {config_name}")
191
+ agent = build_system_from_dir(
192
+ Path(__file__).resolve().parent.parent / f"models/{model_path}",
193
+ config_name=config_name,
194
+ )
195
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
196
+ agent.to(device, fp16=True)
197
+ logger.info(
198
+ f"Successfully built simuleval agent {model_path} on device {device}"
199
+ )
200
+
201
+ return agent
202
+
203
+ def process_incoming_bytes(self, incoming_bytes, dynamic_config):
204
+ # TODO: We probably want to do some validation on dynamic_config to ensure it has what we needs
205
+ segment, sr = self._preprocess_wav(incoming_bytes)
206
+ segment = SpeechSegment(
207
+ content=segment,
208
+ sample_rate=sr,
209
+ tgt_lang=dynamic_config.get("targetLanguage"),
210
+ config=dynamic_config,
211
+ )
212
+ if dynamic_config.get("expressive") is True and self.has_expressive is False:
213
+ logger.warning(
214
+ "Passing 'expressive' but the agent does not support expressive output!"
215
+ )
216
+ # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
217
+ self.input_queue.put_nowait(segment)
218
+
219
+ def get_input_segment(self):
220
+ if self.input_queue.empty():
221
+ return None
222
+ chunk = self.input_queue.get_nowait()
223
+ self.input_queue.task_done()
224
+ return chunk
225
+
226
+ def convert_waveform(
227
+ self,
228
+ waveform: Union[np.ndarray, torch.Tensor],
229
+ sample_rate: int,
230
+ normalize_volume: bool = False,
231
+ to_mono: bool = False,
232
+ to_sample_rate: Optional[int] = None,
233
+ ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
234
+ """convert a waveform:
235
+ - to a target sample rate
236
+ - from multi-channel to mono channel
237
+ - volume normalization
238
+
239
+ Args:
240
+ waveform (numpy.ndarray or torch.Tensor): 2D original waveform
241
+ (channels x length)
242
+ sample_rate (int): original sample rate
243
+ normalize_volume (bool): perform volume normalization
244
+ to_mono (bool): convert to mono channel if having multiple channels
245
+ to_sample_rate (Optional[int]): target sample rate
246
+ Returns:
247
+ waveform (numpy.ndarray): converted 2D waveform (channels x length)
248
+ sample_rate (float): target sample rate
249
+ """
250
+ try:
251
+ import torchaudio.sox_effects as ta_sox
252
+ except ImportError:
253
+ raise ImportError("Please install torchaudio: pip install torchaudio")
254
+
255
+ effects = []
256
+ if normalize_volume:
257
+ effects.append(["gain", "-n"])
258
+ if to_sample_rate is not None and to_sample_rate != sample_rate:
259
+ effects.append(["rate", f"{to_sample_rate}"])
260
+ if to_mono and waveform.shape[0] > 1:
261
+ effects.append(["channels", "1"])
262
+ if len(effects) > 0:
263
+ is_np_input = isinstance(waveform, np.ndarray)
264
+ _waveform = torch.from_numpy(waveform) if is_np_input else waveform
265
+ converted, converted_sample_rate = ta_sox.apply_effects_tensor(
266
+ _waveform, sample_rate, effects
267
+ )
268
+ if is_np_input:
269
+ converted = converted.numpy()
270
+ return converted, converted_sample_rate
271
+ return waveform, sample_rate
272
+
273
+ def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
274
+ segment, sample_rate = soundfile.read(
275
+ io.BytesIO(data),
276
+ dtype="float32",
277
+ always_2d=True,
278
+ frames=-1,
279
+ start=0,
280
+ format="RAW",
281
+ subtype="PCM_16",
282
+ samplerate=self.incoming_sample_rate,
283
+ channels=1,
284
+ )
285
+ if self.debug:
286
+ self.test_incoming_wav.seek(0, soundfile.SEEK_END)
287
+ self.test_incoming_wav.write(segment)
288
+
289
+ segment = segment.T
290
+ segment, new_sample_rate = self.convert_waveform(
291
+ segment,
292
+ sample_rate,
293
+ normalize_volume=False,
294
+ to_mono=True,
295
+ to_sample_rate=MODEL_SAMPLE_RATE,
296
+ )
297
+
298
+ assert MODEL_SAMPLE_RATE == new_sample_rate
299
+ segment = segment.squeeze(axis=0)
300
+ return segment, new_sample_rate
301
+
302
+ def process_pipeline_impl(self, input_segment):
303
+ try:
304
+ with torch.no_grad():
305
+ output_segment = OutputSegments(
306
+ self.agent.pushpop(input_segment, self.states)
307
+ )
308
+ if (
309
+ self.get_states_root().first_input_ts is not None
310
+ and self.first_input_ts is None
311
+ ):
312
+ # TODO: this is hacky
313
+ self.first_input_ts = self.get_states_root().first_input_ts
314
+
315
+ if not output_segment.is_empty:
316
+ self.output_queue.put_nowait(output_segment)
317
+
318
+ if output_segment.finished:
319
+ self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
320
+
321
+ self.reset_states()
322
+
323
+ if self.debug:
324
+ # when we rebuild states, this value is reset to whatever
325
+ # is in the system dir config, which defaults debug=False.
326
+ self.get_states_root().debug = True
327
+ except Exception as e:
328
+ logger.error(f"Got exception while processing pipeline: {e}")
329
+ traceback.print_exc()
330
+ return input_segment
331
+
332
+ def process_pipeline_loop(self):
333
+ if self.close:
334
+ return # closes the thread
335
+
336
+ self.debug_log("processing_pipeline")
337
+ while not self.close:
338
+ input_segment = self.get_input_segment()
339
+ if input_segment is None:
340
+ if self.get_states_root().is_fresh_state: # TODO: this is hacky
341
+ time.sleep(0.3)
342
+ else:
343
+ time.sleep(0.03)
344
+ continue
345
+ self.process_pipeline_impl(input_segment)
346
+ self.debug_log("finished processing_pipeline")
347
+
348
+ def process_pipeline_once(self):
349
+ if self.close:
350
+ return
351
+
352
+ self.debug_log("processing pipeline once")
353
+ input_segment = self.get_input_segment()
354
+ if input_segment is None:
355
+ return
356
+ self.process_pipeline_impl(input_segment)
357
+ self.debug_log("finished processing_pipeline_once")
358
+
359
+ def get_output_segment(self):
360
+ if self.output_queue.empty():
361
+ return None
362
+
363
+ output_chunk = self.output_queue.get_nowait()
364
+ self.output_queue.task_done()
365
+ return output_chunk
366
+
367
+ def start(self):
368
+ self.debug_log("starting transcoder in a thread")
369
+ threading.Thread(target=self.process_pipeline_loop).start()
370
+
371
+ def first_translation_time(self):
372
+ return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
373
+
374
+ def get_buffered_output(self) -> SpeechAndTextOutput:
375
+ now = time.time() * 1000
376
+ self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
377
+ while not self.output_queue.empty():
378
+ tmp_out = self.get_output_segment()
379
+ if tmp_out and tmp_out.compute_length(self.g2p) > 0:
380
+ if len(self.output_buffer) == 0:
381
+ self.last_output_ts = now
382
+ self._populate_output_buffer(tmp_out)
383
+ self._increment_output_buffer_size(tmp_out)
384
+
385
+ if tmp_out.finished:
386
+ self.debug_log("tmp_out.finished")
387
+ res = self._gather_output_buffer_data(final=True)
388
+ self.debug_log(f"gathered output data: {res}")
389
+ self.output_buffer = []
390
+ self.increment_output_buffer_size = 0
391
+ self.last_output_ts = now
392
+ self.first_output_ts = now
393
+ return res
394
+ else:
395
+ self.debug_log("tmp_out.compute_length is not > 0")
396
+
397
+ if len(self.output_buffer) > 0 and (
398
+ now - self.last_output_ts >= self.output_buffer_idle_ms
399
+ or self.output_buffer_cur_size >= self.output_buffer_size_limit
400
+ ):
401
+ self.debug_log(
402
+ "[get_buffered_output] output_buffer is not empty. getting res to return."
403
+ )
404
+ self.last_output_ts = now
405
+ res = self._gather_output_buffer_data(final=False)
406
+ self.debug_log(f"gathered output data: {res}")
407
+ self.output_buffer = []
408
+ self.output_buffer_phoneme_count = 0
409
+ self.first_output_ts = now
410
+ return res
411
+ else:
412
+ self.debug_log("[get_buffered_output] output_buffer is empty...")
413
+ return None
414
+
415
+ def _gather_output_buffer_data(self, final):
416
+ output = SpeechAndTextOutput()
417
+ output.final = final
418
+ output = OutputSegments.join_output_buffer(self.output_buffer, output)
419
+ return output
420
+
421
+ def _increment_output_buffer_size(self, segment: OutputSegments):
422
+ self.output_buffer_cur_size += segment.compute_length(self.g2p)
423
+
424
+ def _populate_output_buffer(self, segment: OutputSegments):
425
+ self.output_buffer.append(segment.segments)
426
+
427
+ def _compute_phoneme_count(self, string: str) -> int:
428
+ return len([x for x in self.g2p(string) if x != " "])
backend/seamless/speech_and_text_output.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Provides a container to return both speech and text output from our model at the same time
2
+
3
+
4
+ class SpeechAndTextOutput:
5
+ def __init__(
6
+ self,
7
+ text: str = None,
8
+ speech_samples: list = None,
9
+ speech_sample_rate: float = None,
10
+ final: bool = False,
11
+ ):
12
+ self.text = text
13
+ self.speech_samples = speech_samples
14
+ self.speech_sample_rate = speech_sample_rate
15
+ self.final = final
backend/seamless/transcoder_helpers.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ logger = logging.getLogger("socketio_server_pubsub")
4
+
5
+
6
+ def get_transcoder_output_events(transcoder) -> list:
7
+ speech_and_text_output = transcoder.get_buffered_output()
8
+ if speech_and_text_output is None:
9
+ logger.debug("No output from transcoder.get_buffered_output()")
10
+ return []
11
+
12
+ logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
13
+
14
+ lat = None
15
+
16
+ events = []
17
+
18
+ if speech_and_text_output.speech_samples:
19
+ events.append(
20
+ {
21
+ "event": "translation_speech",
22
+ "payload": speech_and_text_output.speech_samples,
23
+ "sample_rate": speech_and_text_output.speech_sample_rate,
24
+ }
25
+ )
26
+
27
+ if speech_and_text_output.text:
28
+ events.append(
29
+ {
30
+ "event": "translation_text",
31
+ "payload": speech_and_text_output.text,
32
+ }
33
+ )
34
+
35
+ for e in events:
36
+ e["eos"] = speech_and_text_output.final
37
+
38
+ # if not latency_sent:
39
+ # lat = transcoder.first_translation_time()
40
+ # latency_sent = True
41
+ # to_send["latency"] = lat
42
+
43
+ return events
backend/seamless_utils.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # base seamless imports
3
+ # ---------------------------------
4
+ import io
5
+ import json
6
+ import matplotlib as mpl
7
+ import matplotlib.pyplot as plt
8
+ import mmap
9
+ import numpy as np
10
+ import soundfile
11
+ import torchaudio
12
+ import torch
13
+ from pydub import AudioSegment
14
+ # ---------------------------------
15
+ # seamless-streaming specific imports
16
+ # ---------------------------------
17
+ import math
18
+ from simuleval.data.segments import SpeechSegment, EmptySegment
19
+ from seamless_communication.streaming.agents.seamless_streaming_s2st import (
20
+ SeamlessStreamingS2STVADAgent,
21
+ )
22
+
23
+ from simuleval.utils.arguments import cli_argument_list
24
+ from simuleval import options
25
+
26
+
27
+ from typing import Union, List
28
+ from simuleval.data.segments import Segment, TextSegment
29
+ from simuleval.agents.pipeline import TreeAgentPipeline
30
+ from simuleval.agents.states import AgentStates
31
+ # ---------------------------------
32
+ # seamless setup
33
+ # source: https://colab.research.google.com/github/kauterry/seamless_communication/blob/main/Seamless_Tutorial.ipynb?
34
+ SAMPLE_RATE = 16000
35
+
36
+ # PM - THis class is used to simulate the audio frontend in the seamless streaming pipeline
37
+ # need to replace this with the actual audio frontend
38
+ # TODO: replacement class that takes in PCM-16 bytes and returns SpeechSegment
39
+ class AudioFrontEnd:
40
+ def __init__(self, wav_file, segment_size) -> None:
41
+ self.samples, self.sample_rate = soundfile.read(wav_file)
42
+ print(self.sample_rate, "sample rate")
43
+ assert self.sample_rate == SAMPLE_RATE
44
+ # print(len(self.samples), self.samples[:100])
45
+ self.samples = self.samples # .tolist()
46
+ self.segment_size = segment_size
47
+ self.step = 0
48
+
49
+ def send_segment(self):
50
+ """
51
+ This is the front-end logic in simuleval instance.py
52
+ """
53
+
54
+ num_samples = math.ceil(self.segment_size / 1000 * self.sample_rate)
55
+
56
+ if self.step < len(self.samples):
57
+ if self.step + num_samples >= len(self.samples):
58
+ samples = self.samples[self.step :]
59
+ is_finished = True
60
+ else:
61
+ samples = self.samples[self.step : self.step + num_samples]
62
+ is_finished = False
63
+ self.samples = self.samples[self.step:]
64
+ self.step = min(self.step + num_samples, len(self.samples))
65
+ segment = SpeechSegment(
66
+ content=samples,
67
+ sample_rate=self.sample_rate,
68
+ finished=is_finished,
69
+ )
70
+ else:
71
+ # Finish reading this audio
72
+ segment = EmptySegment(
73
+ finished=True,
74
+ )
75
+ self.step = 0
76
+ self.samples = []
77
+ return segment
78
+
79
+ # samples = self.samples[:num_samples]
80
+ # self.samples = self.samples[num_samples:]
81
+ # segment = SpeechSegment(
82
+ # content=samples,
83
+ # sample_rate=self.sample_rate,
84
+ # finished=False,
85
+ # )
86
+
87
+
88
+ def add_segments(self, wav):
89
+ new_samples, _ = soundfile.read(wav)
90
+ self.samples = np.concatenate((self.samples, new_samples))
91
+
92
+
93
+ class OutputSegments:
94
+ def __init__(self, segments: Union[List[Segment], Segment]):
95
+ if isinstance(segments, Segment):
96
+ segments = [segments]
97
+ self.segments: List[Segment] = [s for s in segments]
98
+
99
+ @property
100
+ def is_empty(self):
101
+ return all(segment.is_empty for segment in self.segments)
102
+
103
+ @property
104
+ def finished(self):
105
+ return all(segment.finished for segment in self.segments)
106
+
107
+
108
+ def get_audiosegment(samples, sr):
109
+ b = io.BytesIO()
110
+ soundfile.write(b, samples, samplerate=sr, format="wav")
111
+ b.seek(0)
112
+ return AudioSegment.from_file(b)
113
+
114
+
115
+ def reset_states(system, states):
116
+ if isinstance(system, TreeAgentPipeline):
117
+ states_iter = states.values()
118
+ else:
119
+ states_iter = states
120
+ for state in states_iter:
121
+ state.reset()
122
+
123
+
124
+ def get_states_root(system, states) -> AgentStates:
125
+ if isinstance(system, TreeAgentPipeline):
126
+ # self.states is a dict
127
+ return states[system.source_module]
128
+ else:
129
+ # self.states is a list
130
+ return system.states[0]
131
+
132
+
133
+ def build_streaming_system(model_configs, agent_class):
134
+ parser = options.general_parser()
135
+ parser.add_argument("-f", "--f", help="a dummy argument to fool ipython", default="1")
136
+
137
+ agent_class.add_args(parser)
138
+ args, _ = parser.parse_known_args(cli_argument_list(model_configs))
139
+ system = agent_class.from_args(args)
140
+ return system
141
+
142
+
143
+ def run_streaming_inference(system, audio_frontend, system_states, tgt_lang):
144
+ # NOTE: Here for visualization, we calculate delays offset from audio
145
+ # *BEFORE* VAD segmentation.
146
+ # In contrast for SimulEval evaluation, we assume audios are pre-segmented,
147
+ # and Average Lagging, End Offset metrics are based on those pre-segmented audios.
148
+ # Thus, delays here are *NOT* comparable to SimulEval per-segment delays
149
+ delays = {"s2st": [], "s2tt": []}
150
+ prediction_lists = {"s2st": [], "s2tt": []}
151
+ speech_durations = []
152
+ curr_delay = 0
153
+ target_sample_rate = None
154
+
155
+ while True:
156
+ input_segment = audio_frontend.send_segment()
157
+ input_segment.tgt_lang = tgt_lang
158
+ curr_delay += len(input_segment.content) / SAMPLE_RATE * 1000
159
+ if input_segment.finished:
160
+ # a hack, we expect a real stream to end with silence
161
+ get_states_root(system, system_states).source_finished = True
162
+ # Translation happens here
163
+ if isinstance(input_segment, EmptySegment):
164
+ return None, None, None, None
165
+ output_segments = OutputSegments(system.pushpop(input_segment, system_states))
166
+ if not output_segments.is_empty:
167
+ for segment in output_segments.segments:
168
+ # NOTE: another difference from SimulEval evaluation -
169
+ # delays are accumulated per-token
170
+ if isinstance(segment, SpeechSegment):
171
+ pred_duration = 1000 * len(segment.content) / segment.sample_rate
172
+ speech_durations.append(pred_duration)
173
+ delays["s2st"].append(curr_delay)
174
+ prediction_lists["s2st"].append(segment.content)
175
+ target_sample_rate = segment.sample_rate
176
+ elif isinstance(segment, TextSegment):
177
+ delays["s2tt"].append(curr_delay)
178
+ prediction_lists["s2tt"].append(segment.content)
179
+ print(curr_delay, segment.content)
180
+ if output_segments.finished:
181
+ reset_states(system, system_states)
182
+ if input_segment.finished:
183
+ # an assumption of SimulEval agents -
184
+ # once source_finished=True, generate until output translation is finished
185
+ break
186
+ return delays, prediction_lists, speech_durations, target_sample_rate
187
+
188
+
189
+ def get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations):
190
+ # get calculate intervals + durations for s2st
191
+ intervals = []
192
+
193
+ start = prev_end = prediction_offset = delays["s2st"][0]
194
+ target_samples = [0.0] * int(target_sample_rate * prediction_offset / 1000)
195
+
196
+ for i, delay in enumerate(delays["s2st"]):
197
+ start = max(prev_end, delay)
198
+
199
+ if start > prev_end:
200
+ # Wait source speech, add discontinuity with silence
201
+ target_samples += [0.0] * int(
202
+ target_sample_rate * (start - prev_end) / 1000
203
+ )
204
+
205
+ target_samples += prediction_lists["s2st"][i]
206
+ duration = speech_durations[i]
207
+ prev_end = start + duration
208
+ intervals.append([start, duration])
209
+ return target_samples, intervals
210
+
backend/src/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (243 Bytes). View file
 
backend/src/testing/test.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import sys
2
+ sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
3
+
4
+ import utility
backend/src/utility/__pycache__/utility.cpython-310.pyc ADDED
Binary file (202 Bytes). View file