Yurii Paniv committed on
Commit
9ce0232
1 Parent(s): e96206b

Add end-to-end speaking demo

Browse files
Files changed (4) hide show
  1. README.md +8 -10
  2. app.py +44 -36
  3. gpt2-uk-conversational +1 -0
  4. requirements.txt +2 -1
README.md CHANGED
@@ -13,19 +13,17 @@ This is a pet project with aim to provide an end-to-end voice chatbot with abili
13
 
14
  It's a project with an aim to demonstrate current state-of-the-art speech technologies for the Ukrainian language.
15
 
 
 
16
  # Technologies used:
17
 
18
- - [ ] [Wav2Vec2 XLS-R 300M fine-tuned to Ukrainian language. WER: `31.56%`](https://huggingface.co/robinhad/wav2vec2-xls-r-300m-uk) for speech recognition.
19
- - [ ] [GlowTTS, trained on M-AILABS dataset](https://github.com/robinhad/ukrainian-tts).
20
- - [ ] Conversational pipeline (TBD)
21
 
22
- # Features
23
-
24
- - [ ] Ability to change backends
25
- - [ ] Support for Google Text-to-Speech/Speech-to-Text
26
- - [ ] Echo backend (speaks recognized phrase)
27
- - [ ] Other models
28
 
29
  # How to set up:
30
 
31
- TBD
 
 
13
 
14
  It's a project with an aim to demonstrate current state-of-the-art speech technologies for the Ukrainian language.
15
 
16
+ Link to speaking demo: [https://huggingface.co/spaces/robinhad/ukrainian-ai](https://huggingface.co/spaces/robinhad/ukrainian-ai)
17
+ Link to text demo: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
18
  # Technologies used:
19
 
20
+ - [Wav2Vec2 XLS-R 300M fine-tuned to Ukrainian language](https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm) for speech recognition.
21
+ - [Ukrainian VITS TTS](https://github.com/robinhad/ukrainian-tts) for text-to-speech generation.
22
+ - Conversational pipeline (this repository)
23
 
24
+ TODO: add training scripts for the conversational pipeline
 
 
 
 
 
25
 
26
  # How to set up:
27
 
28
+ 1. `pip install -r requirements.txt`
29
+ 2. `python app.py`
app.py CHANGED
@@ -1,6 +1,5 @@
1
- import random
2
  import gradio as gr
3
- from transformers import pipeline
4
  import tempfile
5
  import torch
6
  from os.path import exists
@@ -12,7 +11,7 @@ def download(url, file_name):
12
  if not exists(file_name):
13
  print(f"Downloading {file_name}")
14
  r = requests.get(url, allow_redirects=True)
15
- with open(file_name, 'wb') as file:
16
  file.write(r.content)
17
  else:
18
  print(f"Found {file_name}. Skipping download...")
@@ -29,47 +28,56 @@ config_path = "config.json"
29
  download(model_link, model_path)
30
  download(config_link, config_path)
31
 
32
- p = pipeline("automatic-speech-recognition", "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm")
 
 
 
 
 
 
33
 
34
  synthesizer = Synthesizer(
35
- model_path, config_path, None, None, None,
 
 
 
 
 
 
 
 
36
  )
37
 
38
- badge = "https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-ai"
39
 
40
- def transcribe(audio):
41
  text = p(audio)["text"]
 
 
 
 
 
 
42
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
43
  with torch.no_grad():
44
- wavs = synthesizer.tts(text)
45
  synthesizer.save_wav(wavs, fp)
46
- return text, fp.name
47
-
48
- gr.Interface(
49
- fn=transcribe,
50
- inputs=gr.inputs.Audio(source="microphone", type="filepath"),
51
- outputs=[gr.outputs.Textbox(label="Recognized text"),gr.outputs.Audio(label="Output")],
52
- article=f"<center><img src=\"{badge}\" alt=\"visitors badge\"/></center>",).launch()
53
 
54
- def chat(message, history):
55
- history = history or []
56
- #if message.startswith("How many"):
57
- # response = random.randint(1, 10)
58
- #elif message.startswith("How"):
59
- # response = random.choice(["Great", "Good", "Okay", "Bad"])
60
- #elif message.startswith("Where"):
61
- # response = random.choice(["Here", "There", "Somewhere"])
62
- #else:
63
- # response = "I don't know"
64
- #history.append((message, response))
65
- return history, history
66
 
67
-
68
- #iface = gr.Interface(
69
- # chat,
70
- # ["audio", "state"],
71
- # ["chatbot", "state"],
72
- # allow_screenshot=False,
73
- # allow_flagging="never",
74
- #)
75
- #iface.launch()
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from transformers import Conversation, ConversationalPipeline, pipeline
3
  import tempfile
4
  import torch
5
  from os.path import exists
 
11
  if not exists(file_name):
12
  print(f"Downloading {file_name}")
13
  r = requests.get(url, allow_redirects=True)
14
+ with open(file_name, "wb") as file:
15
  file.write(r.content)
16
  else:
17
  print(f"Found {file_name}. Skipping download...")
 
28
  download(model_link, model_path)
29
  download(config_link, config_path)
30
 
31
+ p = pipeline(
32
+ "automatic-speech-recognition", "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm"
33
+ )
34
+
35
+ conv: ConversationalPipeline = pipeline(
36
+ "conversational", "robinhad/gpt2-uk-conversational", use_auth_token=True
37
+ )
38
 
39
  synthesizer = Synthesizer(
40
+ model_path,
41
+ config_path,
42
+ None,
43
+ None,
44
+ None,
45
+ )
46
+
47
+ badge = (
48
+ "https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-ai"
49
  )
50
 
 
51
 
52
+ def transcribe(audio, history):
53
  text = p(audio)["text"]
54
+ history = history or []
55
+ past_user_inputs = [i[0] for i in history]
56
+ generated_responses = [i[1] for i in history]
57
+ response = conv(Conversation(text, past_user_inputs, generated_responses))
58
+ response = response.generated_responses[-1]
59
+ history.append((text, response))
60
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
61
  with torch.no_grad():
62
+ wavs = synthesizer.tts(response)
63
  synthesizer.save_wav(wavs, fp)
64
+ return text, fp.name, history, history
 
 
 
 
 
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ iface = gr.Interface(
68
+ fn=transcribe,
69
+ inputs=[gr.inputs.Audio(source="microphone", type="filepath"), "state"],
70
+ outputs=[
71
+ gr.outputs.Textbox(label="Recognized text"),
72
+ gr.outputs.Audio(label="Output"),
73
+ gr.outputs.Chatbot(label="Chat"),
74
+ "state",
75
+ ],
76
+ description="""Це альфа-версія end-to-end розмовного бота, з яким можна поспілкуватися голосом.
77
+ Перейдіть сюди для доступу до текстової версії: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
78
+ """,
79
+ article=f"""Розпізнавання української: [https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm](https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm)
80
+ Синтез української: [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
81
+ <center><img src="{badge}" alt="visitors badge"/></center>""",
82
+ )
83
+ iface.launch()
gpt2-uk-conversational ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit c70fdb543d8bf0509e5787ce3a7e768ef52e6991
requirements.txt CHANGED
@@ -3,4 +3,5 @@ transformers==4.19.4
3
  TTS==0.6.2
4
  torch
5
  pyctcdecode
6
- https://github.com/kpu/kenlm/archive/master.zip
 
 
3
  TTS==0.6.2
4
  torch
5
  pyctcdecode
6
+ https://github.com/kpu/kenlm/archive/master.zip
7
+ sentencepiece==0.1.96