reach-vb HF staff ylacombe HF staff committed on
Commit
3d59a60
1 Parent(s): c111ea2

hf_implementation (#23)

Browse files

- update with HF implementation (b3882fafaf5d0c32dd9b458e7efbcac2469293a1)


Co-authored-by: Yoach Lacombe <[email protected]>

Files changed (5) hide show
  1. Dockerfile +0 -56
  2. README.md +3 -2
  3. app.py +23 -19
  4. lang_list.py +148 -0
  5. requirements.txt +2 -5
Dockerfile DELETED
@@ -1,56 +0,0 @@
1
- FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
2
- ENV DEBIAN_FRONTEND=noninteractive
3
- RUN apt-get update && \
4
- apt-get upgrade -y && \
5
- apt-get install -y --no-install-recommends \
6
- git \
7
- git-lfs \
8
- wget \
9
- curl \
10
- # python build dependencies \
11
- build-essential \
12
- libssl-dev \
13
- zlib1g-dev \
14
- libbz2-dev \
15
- libreadline-dev \
16
- libsqlite3-dev \
17
- libncursesw5-dev \
18
- xz-utils \
19
- tk-dev \
20
- libxml2-dev \
21
- libxmlsec1-dev \
22
- libffi-dev \
23
- liblzma-dev \
24
- # gradio dependencies \
25
- ffmpeg \
26
- # fairseq2 dependencies \
27
- libsndfile-dev && \
28
- apt-get clean && \
29
- rm -rf /var/lib/apt/lists/*
30
-
31
- RUN useradd -m -u 1000 user
32
- USER user
33
- ENV HOME=/home/user \
34
- PATH=/home/user/.local/bin:${PATH}
35
- WORKDIR ${HOME}/app
36
-
37
- RUN curl https://pyenv.run | bash
38
- ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
39
- ARG PYTHON_VERSION=3.10.12
40
- RUN pyenv install ${PYTHON_VERSION} && \
41
- pyenv global ${PYTHON_VERSION} && \
42
- pyenv rehash && \
43
- pip install --no-cache-dir -U pip setuptools wheel
44
-
45
- COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
46
- RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
47
-
48
- COPY --chown=1000 . ${HOME}/app
49
- ENV PYTHONPATH=${HOME}/app \
50
- PYTHONUNBUFFERED=1 \
51
- GRADIO_ALLOW_FLAGGING=never \
52
- GRADIO_NUM_PORTS=1 \
53
- GRADIO_SERVER_NAME=0.0.0.0 \
54
- GRADIO_THEME=huggingface \
55
- SYSTEM=spaces
56
- CMD ["python", "app.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -3,9 +3,10 @@ title: Seamless M4T
3
  emoji: 📞
4
  colorFrom: blue
5
  colorTo: yellow
6
- sdk: docker
 
7
  pinned: false
8
  suggested_hardware: t4-medium
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
3
  emoji: 📞
4
  colorFrom: blue
5
  colorTo: yellow
6
+ sdk: gradio
7
+ app_file: app.py
8
  pinned: false
9
  suggested_hardware: t4-medium
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
6
  import numpy as np
7
  import torch
8
  import torchaudio
9
- from seamless_communication.models.inference.translator import Translator
10
 
11
  from lang_list import (
12
  LANGUAGE_NAME_TO_CODE,
@@ -14,13 +14,12 @@ from lang_list import (
14
  S2TT_TARGET_LANGUAGE_NAMES,
15
  T2TT_TARGET_LANGUAGE_NAMES,
16
  TEXT_SOURCE_LANGUAGE_NAMES,
 
17
  )
18
 
19
  DESCRIPTION = """# SeamlessM4T
20
-
21
  [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
22
  translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
23
-
24
  This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
25
  translation and more, without relying on multiple separate models.
26
  """
@@ -39,11 +38,9 @@ MAX_INPUT_AUDIO_LENGTH = 60 # in seconds
39
  DEFAULT_TARGET_LANGUAGE = "French"
40
 
41
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
42
- translator = Translator(
43
- model_name_or_card="seamlessM4T_large",
44
- vocoder_name_or_card="vocoder_36langs",
45
- device=device,
46
- )
47
 
48
 
49
  def predict(
@@ -71,18 +68,25 @@ def predict(
71
  if new_arr.shape[1] > max_length:
72
  new_arr = new_arr[:, :max_length]
73
  gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
74
- torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
 
 
75
  else:
76
- input_data = input_text
77
- text_out, wav, sr = translator.predict(
78
- input=input_data,
79
- task_str=task_name,
80
- tgt_lang=target_language_code,
81
- src_lang=source_language_code,
82
- ngram_filtering=True,
83
- )
 
 
 
 
 
84
  if task_name in ["S2ST", "T2ST"]:
85
- return (sr, wav.cpu().detach().numpy()), text_out
86
  else:
87
  return None, text_out
88
 
@@ -430,4 +434,4 @@ demo.queue(max_size=50).launch()
430
 
431
  # Linking models to the space
432
  # 'facebook/seamless-m4t-large'
433
- # 'facebook/SONAR'
 
6
  import numpy as np
7
  import torch
8
  import torchaudio
9
+ from transformers import AutoProcessor, SeamlessM4TModel
10
 
11
  from lang_list import (
12
  LANGUAGE_NAME_TO_CODE,
 
14
  S2TT_TARGET_LANGUAGE_NAMES,
15
  T2TT_TARGET_LANGUAGE_NAMES,
16
  TEXT_SOURCE_LANGUAGE_NAMES,
17
+ LANG_TO_SPKR_ID,
18
  )
19
 
20
  DESCRIPTION = """# SeamlessM4T
 
21
  [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
22
  translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
 
23
  This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
24
  translation and more, without relying on multiple separate models.
25
  """
 
38
  DEFAULT_TARGET_LANGUAGE = "French"
39
 
40
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
41
+
42
+ processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
43
+ model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
 
 
44
 
45
 
46
  def predict(
 
68
  if new_arr.shape[1] > max_length:
69
  new_arr = new_arr[:, :max_length]
70
  gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
71
+
72
+
73
+ input_data = processor(audios = new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
74
  else:
75
+ input_data = processor(text = input_text, src_lang=source_language_code, return_tensors="pt").to(device)
76
+
77
+
78
+ if task_name in ["S2TT", "T2TT"]:
79
+ tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code, num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
80
+ else:
81
+ output = model.generate(**input_data, return_intermediate_token_ids=True, tgt_lang=target_language_code, num_beams=5, do_sample=True, spkr_id=LANG_TO_SPKR_ID[target_language_code][0])
82
+
83
+ waveform = output.waveform.cpu().squeeze().detach().numpy()
84
+ tokens_ids = output.sequences.cpu().squeeze().detach().tolist()
85
+
86
+ text_out = processor.decode(tokens_ids, skip_special_tokens=True)
87
+
88
  if task_name in ["S2ST", "T2ST"]:
89
+ return (AUDIO_SAMPLE_RATE, waveform), text_out
90
  else:
91
  return None, text_out
92
 
 
434
 
435
  # Linking models to the space
436
  # 'facebook/seamless-m4t-large'
437
+ # 'facebook/SONAR'
lang_list.py CHANGED
@@ -252,3 +252,151 @@ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2s
252
  S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
253
  # T2TT
254
  T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
253
  # T2TT
254
  T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
255
+
256
+
257
+ LANG_TO_SPKR_ID = {
258
+ "arb": [
259
+ 0
260
+ ],
261
+ "ben": [
262
+ 2,
263
+ 1
264
+ ],
265
+ "cat": [
266
+ 3
267
+ ],
268
+ "ces": [
269
+ 4
270
+ ],
271
+ "cmn": [
272
+ 5
273
+ ],
274
+ "cym": [
275
+ 6
276
+ ],
277
+ "dan": [
278
+ 7,
279
+ 8
280
+ ],
281
+ "deu": [
282
+ 9
283
+ ],
284
+ "eng": [
285
+ 10
286
+ ],
287
+ "est": [
288
+ 11,
289
+ 12,
290
+ 13
291
+ ],
292
+ "fin": [
293
+ 14
294
+ ],
295
+ "fra": [
296
+ 15
297
+ ],
298
+ "hin": [
299
+ 16
300
+ ],
301
+ "ind": [
302
+ 17,
303
+ 24,
304
+ 18,
305
+ 20,
306
+ 19,
307
+ 21,
308
+ 23,
309
+ 27,
310
+ 26,
311
+ 22,
312
+ 25
313
+ ],
314
+ "ita": [
315
+ 29,
316
+ 28
317
+ ],
318
+ "jpn": [
319
+ 30
320
+ ],
321
+ "kor": [
322
+ 31
323
+ ],
324
+ "mlt": [
325
+ 32,
326
+ 33,
327
+ 34
328
+ ],
329
+ "nld": [
330
+ 35
331
+ ],
332
+ "pes": [
333
+ 36
334
+ ],
335
+ "pol": [
336
+ 37
337
+ ],
338
+ "por": [
339
+ 38
340
+ ],
341
+ "ron": [
342
+ 39
343
+ ],
344
+ "rus": [
345
+ 40
346
+ ],
347
+ "slk": [
348
+ 41
349
+ ],
350
+ "spa": [
351
+ 42
352
+ ],
353
+ "swe": [
354
+ 43,
355
+ 45,
356
+ 44
357
+ ],
358
+ "swh": [
359
+ 46,
360
+ 48,
361
+ 47
362
+ ],
363
+ "tel": [
364
+ 49
365
+ ],
366
+ "tgl": [
367
+ 50
368
+ ],
369
+ "tha": [
370
+ 51,
371
+ 54,
372
+ 55,
373
+ 52,
374
+ 53
375
+ ],
376
+ "tur": [
377
+ 58,
378
+ 57,
379
+ 56
380
+ ],
381
+ "ukr": [
382
+ 59
383
+ ],
384
+ "urd": [
385
+ 60,
386
+ 61,
387
+ 62
388
+ ],
389
+ "uzn": [
390
+ 63,
391
+ 64,
392
+ 65
393
+ ],
394
+ "vie": [
395
+ 66,
396
+ 67,
397
+ 70,
398
+ 71,
399
+ 68,
400
+ 69
401
+ ]
402
+ }
requirements.txt CHANGED
@@ -1,6 +1,3 @@
1
- fairseq2==0.1.0
2
- git+https://github.com/facebookresearch/seamless_communication
3
- gradio==3.40.1
4
- huggingface_hub==0.16.4
5
- torch==2.0.1
6
  torchaudio==2.0.2
 
 
1
+ git+https://github.com/huggingface/transformers
 
 
 
 
2
  torchaudio==2.0.2
3
+ sentencepiece