Spaces:
Configuration error
Configuration error
Yurii Paniv
committed on
Commit
•
9ce0232
1
Parent(s):
e96206b
Add end-to-end speaking demo
Browse files- README.md +8 -10
- app.py +44 -36
- gpt2-uk-conversational +1 -0
- requirements.txt +2 -1
README.md
CHANGED
@@ -13,19 +13,17 @@ This is a pet project with aim to provide an end-to-end voice chatbot with abili
|
|
13 |
|
14 |
It's a project with an aim to demonstrate current state-of-the-art speech technologies for Ukrainian language.
|
15 |
|
|
|
|
|
16 |
# Technologies used:
|
17 |
|
18 |
-
- [
|
19 |
-
- [
|
20 |
-
-
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
- [ ] Ability to change backends
|
25 |
-
- [ ] Support for Google Text-to-Speech/Speech-to-Text
|
26 |
-
- [ ] Echo backend (speaks recognized phrase)
|
27 |
-
- [ ] Other models
|
28 |
|
29 |
# How to setup:
|
30 |
|
31 |
-
|
|
|
|
13 |
|
14 |
It's a project with an aim to demonstrate current state-of-the-art speech technologies for Ukrainian language.
|
15 |
|
16 |
+
Link to speaking demo: [https://huggingface.co/spaces/robinhad/ukrainian-ai](https://huggingface.co/spaces/robinhad/ukrainian-ai)
|
17 |
+
Link to text demo: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
|
18 |
# Technologies used:
|
19 |
|
20 |
+
- [Wav2Vec2 XLS-R 300M fine-tuned for the Ukrainian language](https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm) for speech recognition.
|
21 |
+
- [Ukrainian VITS TTS](https://github.com/robinhad/ukrainian-tts) for text-to-speech generation.
|
22 |
+
- Conversational pipeline (this repository)
|
23 |
|
24 |
+
TODO: training scripts for conversational pipeline
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# How to setup:
|
27 |
|
28 |
+
1. `pip install -r requirements.txt`
|
29 |
+
2. `python app.py`
|
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
-
import random
|
2 |
import gradio as gr
|
3 |
-
from transformers import pipeline
|
4 |
import tempfile
|
5 |
import torch
|
6 |
from os.path import exists
|
@@ -12,7 +11,7 @@ def download(url, file_name):
|
|
12 |
if not exists(file_name):
|
13 |
print(f"Downloading {file_name}")
|
14 |
r = requests.get(url, allow_redirects=True)
|
15 |
-
with open(file_name,
|
16 |
file.write(r.content)
|
17 |
else:
|
18 |
print(f"Found {file_name}. Skipping download...")
|
@@ -29,47 +28,56 @@ config_path = "config.json"
|
|
29 |
download(model_link, model_path)
|
30 |
download(config_link, config_path)
|
31 |
|
32 |
-
p = pipeline(
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
synthesizer = Synthesizer(
|
35 |
-
model_path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
)
|
37 |
|
38 |
-
badge = "https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-ai"
|
39 |
|
40 |
-
def transcribe(audio):
|
41 |
text = p(audio)["text"]
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
|
43 |
with torch.no_grad():
|
44 |
-
wavs = synthesizer.tts(
|
45 |
synthesizer.save_wav(wavs, fp)
|
46 |
-
return text, fp.name
|
47 |
-
|
48 |
-
gr.Interface(
|
49 |
-
fn=transcribe,
|
50 |
-
inputs=gr.inputs.Audio(source="microphone", type="filepath"),
|
51 |
-
outputs=[gr.outputs.Textbox(label="Recognized text"),gr.outputs.Audio(label="Output")],
|
52 |
-
article=f"<center><img src=\"{badge}\" alt=\"visitors badge\"/></center>",).launch()
|
53 |
|
54 |
-
def chat(message, history):
|
55 |
-
history = history or []
|
56 |
-
#if message.startswith("How many"):
|
57 |
-
# response = random.randint(1, 10)
|
58 |
-
#elif message.startswith("How"):
|
59 |
-
# response = random.choice(["Great", "Good", "Okay", "Bad"])
|
60 |
-
#elif message.startswith("Where"):
|
61 |
-
# response = random.choice(["Here", "There", "Somewhere"])
|
62 |
-
#else:
|
63 |
-
# response = "I don't know"
|
64 |
-
#history.append((message, response))
|
65 |
-
return history, history
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import Conversation, ConversationalPipeline, pipeline
|
3 |
import tempfile
|
4 |
import torch
|
5 |
from os.path import exists
|
|
|
11 |
if not exists(file_name):
|
12 |
print(f"Downloading {file_name}")
|
13 |
r = requests.get(url, allow_redirects=True)
|
14 |
+
with open(file_name, "wb") as file:
|
15 |
file.write(r.content)
|
16 |
else:
|
17 |
print(f"Found {file_name}. Skipping download...")
|
|
|
28 |
download(model_link, model_path)
|
29 |
download(config_link, config_path)
|
30 |
|
31 |
+
p = pipeline(
|
32 |
+
"automatic-speech-recognition", "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm"
|
33 |
+
)
|
34 |
+
|
35 |
+
conv: ConversationalPipeline = pipeline(
|
36 |
+
"conversational", "robinhad/gpt2-uk-conversational", use_auth_token=True
|
37 |
+
)
|
38 |
|
39 |
synthesizer = Synthesizer(
|
40 |
+
model_path,
|
41 |
+
config_path,
|
42 |
+
None,
|
43 |
+
None,
|
44 |
+
None,
|
45 |
+
)
|
46 |
+
|
47 |
+
badge = (
|
48 |
+
"https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-ai"
|
49 |
)
|
50 |
|
|
|
51 |
|
52 |
+
def transcribe(audio, history):
|
53 |
text = p(audio)["text"]
|
54 |
+
history = history or []
|
55 |
+
past_user_inputs = [i[0] for i in history]
|
56 |
+
generated_responses = [i[1] for i in history]
|
57 |
+
response = conv(Conversation(text, past_user_inputs, generated_responses))
|
58 |
+
response = response.generated_responses[-1]
|
59 |
+
history.append((text, response))
|
60 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
|
61 |
with torch.no_grad():
|
62 |
+
wavs = synthesizer.tts(response)
|
63 |
synthesizer.save_wav(wavs, fp)
|
64 |
+
return text, fp.name, history, history
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
+
iface = gr.Interface(
|
68 |
+
fn=transcribe,
|
69 |
+
inputs=[gr.inputs.Audio(source="microphone", type="filepath"), "state"],
|
70 |
+
outputs=[
|
71 |
+
gr.outputs.Textbox(label="Recognized text"),
|
72 |
+
gr.outputs.Audio(label="Output"),
|
73 |
+
gr.outputs.Chatbot(label="Chat"),
|
74 |
+
"state",
|
75 |
+
],
|
76 |
+
description="""Це альфа-версія end-to-end розмовного бота, з яким можна поспілкуватися голосом.
|
77 |
+
Перейдіть сюди для доступу до текстової версії: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
|
78 |
+
""",
|
79 |
+
article=f"""Розпізнавання української: [https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm](https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm)
|
80 |
+
Синтез української: [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
|
81 |
+
<center><img src="{badge}" alt="visitors badge"/></center>""",
|
82 |
+
)
|
83 |
+
iface.launch()
|
gpt2-uk-conversational
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Subproject commit c70fdb543d8bf0509e5787ce3a7e768ef52e6991
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ transformers==4.19.4
|
|
3 |
TTS==0.6.2
|
4 |
torch
|
5 |
pyctcdecode
|
6 |
-
https://github.com/kpu/kenlm/archive/master.zip
|
|
|
|
3 |
TTS==0.6.2
|
4 |
torch
|
5 |
pyctcdecode
|
6 |
+
https://github.com/kpu/kenlm/archive/master.zip
|
7 |
+
sentencepiece==0.1.96
|