Anna Sun
committed on
Commit
•
1143e8d
1
Parent(s):
070b677
add dual non-expr/expressive agent, install sc from github
Browse files
- Dockerfile +5 -0
- seamless_server/models/Seamless/vad_s2st_sc_24khz_main.yaml +25 -0
- seamless_server/models/SeamlessStreaming/{vad_s2st_sc_24khz_main.yaml → vad_s2st_sc_main.yaml} +0 -0
- seamless_server/requirements.txt +2 -1
- seamless_server/src/simuleval_agent_directory.py +29 -8
- seamless_server/src/simuleval_transcoder.py +7 -2
- seamless_server/whl/seamless_communication-1.0.0-py3-none-any.whl +2 -2
- streaming-react-app/src/StreamingInterface.tsx +27 -0
- streaming-react-app/src/types/StreamingTypes.ts +1 -0
Dockerfile
CHANGED
@@ -71,6 +71,11 @@ RUN cd seamless_server && \
|
|
71 |
COPY --from=frontend /app/dist ./streaming-react-app/dist
|
72 |
|
73 |
WORKDIR $HOME/app/seamless_server
|
|
|
|
|
|
|
|
|
|
|
74 |
USER root
|
75 |
RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
|
76 |
USER user
|
|
|
71 |
COPY --from=frontend /app/dist ./streaming-react-app/dist
|
72 |
|
73 |
WORKDIR $HOME/app/seamless_server
|
74 |
+
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
|
75 |
+
huggingface-cli login --token $(cat /run/secrets/HF_TOKEN) && \
|
76 |
+
huggingface-cli download meta-private/SeamlessExpressive pretssel_melhifigan_wm-final.pt --local-dir ./models/Seamless/ && \
|
77 |
+
ln -s $(readlink -f models/Seamless/pretssel_melhifigan_wm-final.pt) models/Seamless/pretssel_melhifigan_wm.pt
|
78 |
+
|
79 |
USER root
|
80 |
RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
|
81 |
USER user
|
seamless_server/models/Seamless/vad_s2st_sc_24khz_main.yaml
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
|
2 |
+
monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
|
3 |
+
unity_model_name: seamless_streaming_unity
|
4 |
+
sentencepiece_model: spm_256k_nllb100.model
|
5 |
+
|
6 |
+
task: s2st
|
7 |
+
tgt_lang: "eng"
|
8 |
+
min_unit_chunk_size: 50
|
9 |
+
decision_threshold: 0.7
|
10 |
+
no_early_stop: True
|
11 |
+
block_ngrams: True
|
12 |
+
vocoder_name: vocoder_v2
|
13 |
+
expr_vocoder_name: vocoder_pretssel
|
14 |
+
gated_model_dir: .
|
15 |
+
expr_vocoder_gain: 3.0
|
16 |
+
upstream_idx: 1
|
17 |
+
wav2vec_yaml: wav2vec.yaml
|
18 |
+
min_starting_wait_w2vbert: 192
|
19 |
+
|
20 |
+
config_yaml: cfg_fbank_u2t.yaml
|
21 |
+
upstream_idx: 1
|
22 |
+
detokenize_only: True
|
23 |
+
device: cuda:0
|
24 |
+
max_len_a: 0
|
25 |
+
max_len_b: 1000
|
seamless_server/models/SeamlessStreaming/{vad_s2st_sc_24khz_main.yaml → vad_s2st_sc_main.yaml}
RENAMED
File without changes
|
seamless_server/requirements.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
# seamless_communication
|
2 |
-
|
|
|
3 |
Flask==2.1.3
|
4 |
Flask_Sockets==0.2.1
|
5 |
g2p_en==2.1.0
|
|
|
1 |
# seamless_communication
|
2 |
+
git+https://github.com/facebookresearch/seamless_communication.git
|
3 |
+
# ./whl/seamless_communication-1.0.0-py3-none-any.whl
|
4 |
Flask==2.1.3
|
5 |
Flask_Sockets==0.2.1
|
6 |
g2p_en==2.1.0
|
seamless_server/src/simuleval_agent_directory.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
# Creates a directory in which to look up available agents
|
2 |
|
3 |
-
|
|
|
4 |
from src.simuleval_transcoder import SimulevalTranscoder
|
5 |
import json
|
6 |
import logging
|
@@ -33,8 +34,10 @@ class AgentWithInfo:
|
|
33 |
# Supported dynamic params are defined in StreamingTypes.ts
|
34 |
dynamic_params: List[str] = [],
|
35 |
description="",
|
|
|
36 |
):
|
37 |
self.agent = agent
|
|
|
38 |
self.name = name
|
39 |
self.description = description
|
40 |
self.modalities = modalities
|
@@ -75,6 +78,7 @@ class AgentWithInfo:
|
|
75 |
class SimulevalAgentDirectory:
|
76 |
# Available models. These are the directories where the models can be found, and also serve as an ID for the model.
|
77 |
seamless_streaming_agent = "SeamlessStreaming"
|
|
|
78 |
|
79 |
def __init__(self):
|
80 |
self.agents = []
|
@@ -96,7 +100,12 @@ class SimulevalAgentDirectory:
|
|
96 |
model_id,
|
97 |
)
|
98 |
except Exception as e:
|
|
|
99 |
logger.warning("Failed to build agent %s: %s" % (model_id, e))
|
|
|
|
|
|
|
|
|
100 |
raise e
|
101 |
|
102 |
return agent
|
@@ -110,20 +119,32 @@ class SimulevalAgentDirectory:
|
|
110 |
for agent_info in agent_infos:
|
111 |
self.add_agent(agent_info)
|
112 |
else:
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
-
if
|
119 |
self.add_agent(
|
120 |
AgentWithInfo(
|
121 |
-
agent=
|
122 |
name=SimulevalAgentDirectory.seamless_streaming_agent,
|
123 |
modalities=["s2t", "s2s"],
|
124 |
target_langs=M4T_P0_LANGS,
|
125 |
dynamic_params=["expressive"],
|
126 |
description="multilingual expressive model that supports S2S and S2T",
|
|
|
127 |
)
|
128 |
)
|
129 |
|
@@ -137,7 +158,7 @@ class SimulevalAgentDirectory:
|
|
137 |
def get_agent(self, name):
|
138 |
for agent in self.agents:
|
139 |
if agent.name == name:
|
140 |
-
return agent
|
141 |
return None
|
142 |
|
143 |
def get_agent_or_throw(self, name):
|
|
|
1 |
# Creates a directory in which to look up available agents
|
2 |
|
3 |
+
import os
|
4 |
+
from typing import List, Optional
|
5 |
from src.simuleval_transcoder import SimulevalTranscoder
|
6 |
import json
|
7 |
import logging
|
|
|
34 |
# Supported dynamic params are defined in StreamingTypes.ts
|
35 |
dynamic_params: List[str] = [],
|
36 |
description="",
|
37 |
+
has_expressive: Optional[bool] = None,
|
38 |
):
|
39 |
self.agent = agent
|
40 |
+
self.has_expressive = has_expressive
|
41 |
self.name = name
|
42 |
self.description = description
|
43 |
self.modalities = modalities
|
|
|
78 |
class SimulevalAgentDirectory:
|
79 |
# Available models. These are the directories where the models can be found, and also serve as an ID for the model.
|
80 |
seamless_streaming_agent = "SeamlessStreaming"
|
81 |
+
seamless_agent = "Seamless"
|
82 |
|
83 |
def __init__(self):
|
84 |
self.agents = []
|
|
|
100 |
model_id,
|
101 |
)
|
102 |
except Exception as e:
|
103 |
+
from fairseq2.assets.error import AssetError
|
104 |
logger.warning("Failed to build agent %s: %s" % (model_id, e))
|
105 |
+
if isinstance(e, AssetError):
|
106 |
+
logger.warning(
|
107 |
+
"Please download gated assets and set `gated_model_dir` in the config"
|
108 |
+
)
|
109 |
raise e
|
110 |
|
111 |
return agent
|
|
|
119 |
for agent_info in agent_infos:
|
120 |
self.add_agent(agent_info)
|
121 |
else:
|
122 |
+
s2s_agent = None
|
123 |
+
if os.environ.get("USE_EXPRESSIVE_MODEL"):
|
124 |
+
logger.info("Building expressive model...")
|
125 |
+
s2s_agent = self.build_agent_if_available(
|
126 |
+
SimulevalAgentDirectory.seamless_agent,
|
127 |
+
config_name="vad_s2st_sc_24khz_main.yaml",
|
128 |
+
)
|
129 |
+
has_expressive = True
|
130 |
+
else:
|
131 |
+
logger.info("Building non-expressive model...")
|
132 |
+
s2s_agent = self.build_agent_if_available(
|
133 |
+
SimulevalAgentDirectory.seamless_streaming_agent,
|
134 |
+
config_name="vad_s2st_sc_main.yaml",
|
135 |
+
)
|
136 |
+
has_expressive = False
|
137 |
|
138 |
+
if s2s_agent:
|
139 |
self.add_agent(
|
140 |
AgentWithInfo(
|
141 |
+
agent=s2s_agent,
|
142 |
name=SimulevalAgentDirectory.seamless_streaming_agent,
|
143 |
modalities=["s2t", "s2s"],
|
144 |
target_langs=M4T_P0_LANGS,
|
145 |
dynamic_params=["expressive"],
|
146 |
description="multilingual expressive model that supports S2S and S2T",
|
147 |
+
has_expressive=has_expressive,
|
148 |
)
|
149 |
)
|
150 |
|
|
|
158 |
def get_agent(self, name):
|
159 |
for agent in self.agents:
|
160 |
if agent.name == name:
|
161 |
+
return agent
|
162 |
return None
|
163 |
|
164 |
def get_agent_or_throw(self, name):
|
seamless_server/src/simuleval_transcoder.py
CHANGED
@@ -119,7 +119,8 @@ class OutputSegments:
|
|
119 |
|
120 |
class SimulevalTranscoder:
|
121 |
def __init__(self, agent, sample_rate, debug, buffer_limit):
|
122 |
-
self.agent = agent
|
|
|
123 |
self.input_queue = asyncio.Queue()
|
124 |
self.output_queue = asyncio.Queue()
|
125 |
self.states = self.agent.build_states()
|
@@ -185,7 +186,7 @@ class SimulevalTranscoder:
|
|
185 |
logger.info(*args)
|
186 |
|
187 |
@classmethod
|
188 |
-
def build_agent(cls, model_path, config_name
|
189 |
logger.info(f"Building simuleval agent: {model_path}, {config_name}")
|
190 |
agent = build_system_from_dir(
|
191 |
Path(__file__).resolve().parent.parent / f"models/{model_path}",
|
@@ -208,6 +209,10 @@ class SimulevalTranscoder:
|
|
208 |
tgt_lang=dynamic_config.get("targetLanguage"),
|
209 |
config=dynamic_config,
|
210 |
)
|
|
|
|
|
|
|
|
|
211 |
# # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
|
212 |
self.input_queue.put_nowait(segment)
|
213 |
|
|
|
119 |
|
120 |
class SimulevalTranscoder:
|
121 |
def __init__(self, agent, sample_rate, debug, buffer_limit):
|
122 |
+
self.agent = agent.agent
|
123 |
+
self.has_expressive = agent.has_expressive
|
124 |
self.input_queue = asyncio.Queue()
|
125 |
self.output_queue = asyncio.Queue()
|
126 |
self.states = self.agent.build_states()
|
|
|
186 |
logger.info(*args)
|
187 |
|
188 |
@classmethod
|
189 |
+
def build_agent(cls, model_path, config_name):
|
190 |
logger.info(f"Building simuleval agent: {model_path}, {config_name}")
|
191 |
agent = build_system_from_dir(
|
192 |
Path(__file__).resolve().parent.parent / f"models/{model_path}",
|
|
|
209 |
tgt_lang=dynamic_config.get("targetLanguage"),
|
210 |
config=dynamic_config,
|
211 |
)
|
212 |
+
if dynamic_config.get("expressive") is True and self.has_expressive is False:
|
213 |
+
logger.warning(
|
214 |
+
"Passing 'expressive' but the agent does not support expressive output!"
|
215 |
+
)
|
216 |
# # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
|
217 |
self.input_queue.put_nowait(segment)
|
218 |
|
seamless_server/whl/seamless_communication-1.0.0-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d5b81add4d9917ac562c2e8a10bd5b3c88804b8bd94c56cef4e9a01ecde4a839
|
3 |
+
size 204321
|
streaming-react-app/src/StreamingInterface.tsx
CHANGED
@@ -165,6 +165,9 @@ export default function StreamingInterface() {
|
|
165 |
|
166 |
// Dynamic Params:
|
167 |
const [targetLang, setTargetLang] = useState<string | null>(null);
|
|
|
|
|
|
|
168 |
|
169 |
const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
|
170 |
debugParam ?? false,
|
@@ -246,6 +249,7 @@ export default function StreamingInterface() {
|
|
246 |
setAgent((prevAgent) => {
|
247 |
if (prevAgent?.name !== newAgent?.name) {
|
248 |
setTargetLang(newAgent?.targetLangs[0] ?? null);
|
|
|
249 |
}
|
250 |
return newAgent;
|
251 |
});
|
@@ -421,6 +425,7 @@ export default function StreamingInterface() {
|
|
421 |
// available before actually configuring and starting the stream
|
422 |
const fullDynamicConfig: DynamicConfig = {
|
423 |
targetLanguage: targetLang,
|
|
|
424 |
};
|
425 |
|
426 |
await onSetDynamicConfig(fullDynamicConfig);
|
@@ -906,6 +911,28 @@ export default function StreamingInterface() {
|
|
906 |
spacing={1}
|
907 |
alignItems="flex-start"
|
908 |
sx={{flexGrow: 1}}>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
{isListener && (
|
910 |
<Box
|
911 |
sx={{
|
|
|
165 |
|
166 |
// Dynamic Params:
|
167 |
const [targetLang, setTargetLang] = useState<string | null>(null);
|
168 |
+
const [enableExpressive, setEnableExpressive] = useState<boolean | null>(
|
169 |
+
null,
|
170 |
+
);
|
171 |
|
172 |
const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
|
173 |
debugParam ?? false,
|
|
|
249 |
setAgent((prevAgent) => {
|
250 |
if (prevAgent?.name !== newAgent?.name) {
|
251 |
setTargetLang(newAgent?.targetLangs[0] ?? null);
|
252 |
+
setEnableExpressive(null);
|
253 |
}
|
254 |
return newAgent;
|
255 |
});
|
|
|
425 |
// available before actually configuring and starting the stream
|
426 |
const fullDynamicConfig: DynamicConfig = {
|
427 |
targetLanguage: targetLang,
|
428 |
+
expressive: enableExpressive,
|
429 |
};
|
430 |
|
431 |
await onSetDynamicConfig(fullDynamicConfig);
|
|
|
911 |
spacing={1}
|
912 |
alignItems="flex-start"
|
913 |
sx={{flexGrow: 1}}>
|
914 |
+
{currentAgent?.dynamicParams?.includes(
|
915 |
+
'expressive',
|
916 |
+
) && (
|
917 |
+
<FormControlLabel
|
918 |
+
control={
|
919 |
+
<Switch
|
920 |
+
checked={enableExpressive ?? false}
|
921 |
+
onChange={(
|
922 |
+
event: React.ChangeEvent<HTMLInputElement>,
|
923 |
+
) => {
|
924 |
+
const newValue = event.target.checked;
|
925 |
+
setEnableExpressive(newValue);
|
926 |
+
onSetDynamicConfig({
|
927 |
+
expressive: newValue,
|
928 |
+
});
|
929 |
+
}}
|
930 |
+
/>
|
931 |
+
}
|
932 |
+
label="Expressive"
|
933 |
+
/>
|
934 |
+
)}
|
935 |
+
|
936 |
{isListener && (
|
937 |
<Box
|
938 |
sx={{
|
streaming-react-app/src/types/StreamingTypes.ts
CHANGED
@@ -113,6 +113,7 @@ export type TranslationSentences = Array<string>;
|
|
113 |
export type DynamicConfig = {
|
114 |
// targetLanguage: a 3-letter string representing the desired output language.
|
115 |
targetLanguage: string;
|
|
|
116 |
};
|
117 |
|
118 |
export type PartialDynamicConfig = Partial<DynamicConfig>;
|
|
|
113 |
export type DynamicConfig = {
|
114 |
// targetLanguage: a 3-letter string representing the desired output language.
|
115 |
targetLanguage: string;
|
116 |
+
expressive: boolean | null;
|
117 |
};
|
118 |
|
119 |
export type PartialDynamicConfig = Partial<DynamicConfig>;
|