Initial commit of the app.
Browse files- moore_app.py +98 -0
- src/__init__.py +5 -0
- src/helpers.py +16 -0
- src/language_id.py +41 -0
- src/speech_to_text.py +45 -0
- src/text_to_speech.py +35 -0
- src/translation.py +136 -0
moore_app.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import scipy
|
3 |
+
import os
|
4 |
+
import streamlit as st
|
5 |
+
from transformers import set_seed, pipeline
|
6 |
+
from transformers import VitsTokenizer, VitsModel
|
7 |
+
from datasets import load_dataset, Audio
|
8 |
+
from IPython.display import Audio as Aud
|
9 |
+
from src import *
|
10 |
+
|
11 |
+
from huggingface_hub import login
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
|
14 |
+
#load_dotenv()
|
15 |
+
#HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
|
16 |
+
#login(HUGGINGFACE_KEY)
|
17 |
+
|
18 |
+
|
19 |
+
########################
|
20 |
+
# ISO-3 codes offered in every language picker of the app.
language_list = ['mos', 'fra', 'eng']

st.title("Demo: Automated Tools for Mooré Language")

# One tab per tool; the handles are used as context managers further down.
tab_labels = ["Text to speech", "Speech to text", "Translation", "Language ID"]
tts, stt, trans, lid = st.tabs(tab_labels)
|
25 |
+
|
26 |
+
########################
|
27 |
+
with tts:
    # Text-to-speech tab: read text and its language, then play the synthesized audio.
    tts_text = st.text_area(label="Please enter your text here:", value="", placeholder="ne y wĩndga")

    # Two equal columns; only the first holds a widget, so the picker takes half the width.
    tts_col1, tts_col2 = st.columns(2)

    with tts_col1:
        tts_lang = st.selectbox('Language of text', language_list, format_func=decode_iso)

    if st.button("Speak"):
        st.divider()
        if not tts_text.strip():
            # Nothing to say: ask for input instead of running the model on an empty string.
            st.warning("Please enter some text to synthesize.")
        else:
            with st.spinner(":rainbow[Synthesizing, please wait...]"):
                synth = synthesize_facebook(tts_text, tts_lang)
            # NOTE(review): assumes the synthesizer produces 16 kHz audio — confirm against the model config.
            st.audio(synth, sample_rate=16_000)
|
43 |
+
|
44 |
+
########################
|
45 |
+
with stt:
    # Speech-to-text tab: upload a recording, pick its language, show the transcript.
    stt_file = st.file_uploader("Please upload an audio file:", type=['mp3', 'm4a'], key="stt_uploader")
    stt_lang = st.selectbox("Please select the language:", language_list, format_func=decode_iso)

    if st.button("Transcribe"):
        st.divider()
        if stt_file is None:
            # The uploader yields None until a file is chosen; do not hand that to the model.
            st.warning("Please upload an audio file first.")
        else:
            # The leading colon is required for the :rainbow[...] colour markup to apply.
            with st.spinner(":rainbow[Received your file, please wait while I process it...]"):
                # Kept in its own name so the `stt` tab handle is not overwritten.
                transcript = transcribe(stt_file, stt_lang)
            st.write(":violet[The transcription is:]")
            st.write(':violet[ "' + transcript + '"]')
|
57 |
+
|
58 |
+
########################
|
59 |
+
with trans:
    # Translation tab: read text plus source/target languages, then show the translations.
    trans_text = st.text_area(label="Please enter your translation text here:", value="", placeholder="ne y wĩndga")
    #trans_col1, trans_col2, trans_col3 = st.columns([.25, .25, .5])
    trans_col1, trans_col2 = st.columns(2)

    with trans_col1:
        src_lang = st.selectbox('Translate from:', language_list, format_func=decode_iso)
    with trans_col2:
        # index=1 preselects the second entry so source and target differ by default.
        target_lang = st.selectbox('Translate to:', language_list, format_func=decode_iso, index=1)
    # Disabled model picker, kept for reference:
    #with trans_col3:
    #    trans_model = st.selectbox("Translation model:",
    #                    ("Facebook (nllb-200-distilled-600M)",
    #                     "Helsinki NLP (opus-mt-mos-en)",
    #                     "Masakhane (m2m100_418m_mos_fr_news)")
    #                    )

    if st.button("Translate"):
        st.divider()
        if not trans_text.strip():
            # Nothing to translate: skip the model calls and ask for input.
            st.warning("Please enter some text to translate.")
        else:
            with st.spinner(":rainbow[Translating from " + decode_iso(src_lang) + " into " + decode_iso(target_lang) + ", please wait...]"):
                translation = translate(trans_text, src_lang, target_lang) #, trans_model)
            # Written explicitly rather than as a bare expression, so it does not depend on "magic" output.
            st.write(translation)
|
82 |
+
|
83 |
+
########################
|
84 |
+
with lid:
    # Language-ID tab: upload a recording and report which language was detected.
    langid_file = st.file_uploader("Please upload an audio file:", type=['mp3', 'm4a'], key="lid_uploader")

    if st.button("Identify"):
        st.divider()
        if langid_file is None:
            # The uploader yields None until a file is chosen; do not hand that to the model.
            st.warning("Please upload an audio file first.")
        else:
            with st.spinner(":rainbow[Received your file, please wait while I process it...]"):
                lang = identify_language(langid_file)
                # Turn the detected code into a readable language name.
                lang = decode_iso(lang)
            st.write(":violet[The detected language is " + lang + "]")
|
93 |
+
|
94 |
+
|
95 |
+
# supported colors: blue, green, orange, red, violet, gray/grey, rainbow.
|
96 |
+
# https://docs.streamlit.io/library/api-reference/text/st.markdown
|
97 |
+
|
98 |
+
|
src/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .helpers import *
|
2 |
+
from .language_id import *
|
3 |
+
from .speech_to_text import *
|
4 |
+
from .text_to_speech import *
|
5 |
+
from .translation import *
|
src/helpers.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Language name -> ISO-3 code.
iso_encoder = dict(English="eng", French="fra", Moore="mos")

# Reverse lookup: ISO-3 code -> language name.
iso_decoder = {code: name for name, code in iso_encoder.items()}
|
7 |
+
|
8 |
+
import pycountry
|
9 |
+
|
10 |
+
def encode_iso(lang:str)-> str:
    '''
    Look up the ISO-3 code for a language name.

    Parameters
    ----------
    lang: str
        A language name that is a key of ``iso_encoder``.

    Returns
    ----------
    code:str
        The ISO-3 code stored for that name.

    Raises
    ----------
    KeyError
        If the name is not a key of ``iso_encoder``.
    '''
    code = iso_encoder[lang]
    return code
|
13 |
+
|
14 |
+
def decode_iso(iso:str)-> str:
    '''
    Takes an ISO-3 code and returns the name of the language.

    Parameters
    ----------
    iso: str
        The ISO-3 code to look up in pycountry's language database.

    Returns
    ----------
    name:str
        The language name, or the code itself when the database has no entry for it.
    '''
    language = pycountry.languages.get(alpha_3 = iso)
    if language is None:
        # Unknown code: show it unchanged instead of failing on `None.name`.
        return iso
    return language.name
|
src/language_id.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import librosa
|
3 |
+
import torch
|
4 |
+
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
|
5 |
+
from transformers import set_seed
|
6 |
+
|
7 |
+
def identify_language(fp:str) -> str:
    '''
    Detect which language is spoken in an audio file.

    Parameters
    ----------
    fp: str
        The file path to the audio file.

    Returns
    ----------
    detected_lang:str
        The iso3 code of the detected language.
    '''
    # Fix the random seed so repeated runs behave the same
    set_seed(555)

    # 256 languages is the first language-ID checkpoint that contains MOS
    checkpoint = "facebook/mms-lid-256"
    feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    classifier = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint)

    # Load the audio at 16 kHz and convert it into model inputs
    waveform, _ = librosa.load(fp, sr=16000)
    features = feature_extractor(waveform, sampling_rate=16_000, return_tensors="pt")

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = classifier(**features).logits

    # Top-scoring class of the first item in the batch, mapped to its label
    best_class = logits.argmax(dim=-1)[0].item()
    return classifier.config.id2label[best_class]
|
src/speech_to_text.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import librosa
|
3 |
+
import torch
|
4 |
+
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
5 |
+
from transformers import set_seed
|
6 |
+
|
7 |
+
|
8 |
+
def transcribe(fp:str, target_lang:str) -> str:
    '''
    For given audio file, transcribe it.

    Parameters
    ----------
    fp: str
        The file path to the audio file.
    target_lang:str
        The ISO-3 code of the language spoken in the audio; it selects which
        language variant of the model and processor is loaded.

    Returns
    ----------
    transcript:str
        The transcribed text.
    '''
    # Ensure replicability
    set_seed(555)

    # Load transcription model for the requested language.
    # (Previously the argument was overwritten with "mos", so the caller's choice was ignored.)
    model_id = "facebook/mms-1b-all"

    processor = AutoProcessor.from_pretrained(model_id, target_lang=target_lang)
    # ignore_mismatched_sizes allows the output layer to be resized for the chosen language.
    model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True)

    # Process the audio: load it at 16 kHz and turn it into model inputs
    signal, sampling_rate = librosa.load(fp, sr=16000)
    inputs = processor(signal, sampling_rate=16_000, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Pick the best-scoring token per frame for the first item in the batch, then decode to text
    ids = torch.argmax(outputs, dim=-1)[0]
    transcript = processor.decode(ids)

    return transcript
|
src/text_to_speech.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import torch
|
3 |
+
from transformers import set_seed
|
4 |
+
from transformers import VitsTokenizer, VitsModel
|
5 |
+
from IPython.display import Audio as Aud
|
6 |
+
|
7 |
+
def synthesize_facebook(s:str, iso3:str) -> "numpy.ndarray":
    '''
    For given text, speak it.

    Parameters
    ----------
    s: str
        The written text.
    iso3:str
        The ISO-3 code of the text's language; it selects the
        ``facebook/mms-tts-{iso3}`` checkpoint.

    Returns
    ----------
    synth:numpy.ndarray
        The synthesized audio waveform (first item of the model's output batch).
    '''
    # Ensure replicability, as the other inference helpers in this package do
    set_seed(555)

    # Load synthesizer
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    synth = outputs.waveform[0]

    return synth.numpy()
|
src/translation.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import set_seed, pipeline
|
3 |
+
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
4 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
5 |
+
|
6 |
+
######### HELSINKI NLP ##################
|
7 |
+
def translate_helsinki_nlp(s:str, src_iso:str, dest_iso:str)-> str:
    '''
    Translate text with one of HelsinkiNLP's Opus models for Mossi language.

    Parameters
    ----------
    s: str
        The text
    src_iso:
        The source-language code as it appears in the model name
    dest_iso:
        The destination-language code as it appears in the model name

    Returns
    ----------
    translation:str
        The translated text
    '''
    # Fix the random seed so repeated runs behave the same
    set_seed(555)

    # The language pair picks the checkpoint
    checkpoint = f"Helsinki-NLP/opus-mt-{src_iso}-{dest_iso}"
    translator = pipeline("translation", model=checkpoint)

    # The pipeline returns one dict per input; take the text of the first
    results = translator(s)
    return results[0]['translation_text']
|
33 |
+
|
34 |
+
######### MASAKHANE ##################
|
35 |
+
def translate_masakhane(s:str, src_iso:str, dest_iso:str)-> str:
    '''
    Translate text with one of Masakhane's M2M models for Mossi language.

    Parameters
    ----------
    s: str
        The text
    src_iso:
        The source-language code as it appears in the model name
    dest_iso:
        The destination-language code as it appears in the model name

    Returns
    ----------
    translation:str
        The translated text
    '''
    # Fix the random seed so repeated runs behave the same
    set_seed(555)

    # Model and tokenizer come from the same pair-specific checkpoint
    checkpoint = f"masakhane/m2m100_418m_{src_iso}_{dest_iso}_news"
    model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)

    # Tokenize, generate, and decode the first sequence
    batch = tokenizer(s, return_tensors="pt")
    output_ids = model.generate(**batch)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    return decoded[0]
|
66 |
+
|
67 |
+
######### META ##################
|
68 |
+
def translate_facebook(s:str, src_iso:str, dest_iso:str, max_length:int=30)-> str:
    '''
    Translate the text using Meta's NLLB model for Mossi language.

    Parameters
    ----------
    s: str
        The text
    src_iso:
        The ISO-3 code of the source language
    dest_iso:
        The ISO-3 code of the destination language
    max_length: int
        Upper bound passed to generation; output longer than this is cut off.
        Defaults to 30, the value that used to be hard-coded.

    Returns
    ----------
    translation:str
        The translated text

    Raises
    ----------
    KeyError
        If the tokenizer has no language token for ``{dest_iso}_Latn``.
    '''
    # Ensure replicability
    set_seed(555)

    # Load model
    # NOTE(review): use_auth_token=True is kept unchanged — confirm a stored token is really needed here.
    model_id = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True, src_lang=f"{src_iso}_Latn")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, use_auth_token=True)

    # Resolve the target-language token through the tokenizer's vocabulary
    # instead of the deprecated `lang_code_to_id` mapping.
    target_code = f"{dest_iso}_Latn"
    target_id = tokenizer.convert_tokens_to_ids(target_code)
    if target_id == tokenizer.unk_token_id:
        # Unknown codes used to raise KeyError from the mapping; keep that behaviour.
        raise KeyError(target_code)

    # Inference
    encoded = tokenizer(s, return_tensors="pt")
    translated_tokens = model.generate(**encoded, forced_bos_token_id=target_id, max_length=max_length)
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return translation
|
99 |
+
|
100 |
+
|
101 |
+
######### ALL OF THE ABOVE ##################
|
102 |
+
def translate(s, src_iso, dest_iso):
    '''
    Run the text through every model that supports the language pair
    (Meta always, plus Helsinki NLP and Masakhane where applicable).

    Parameters
    ----------
    s: str
        The text
    src_iso:
        The ISO-3 code of the source language
    dest_iso:
        The ISO-3 code of the destination language

    Returns
    ----------
    translation:str
        The translated text, concatenated over different models
    '''
    # Meta NLLB is used for every pair
    sections = ["Meta's NLLB translation is:\n\n" + translate_facebook(s, src_iso, dest_iso)]

    iso_pair = f"{src_iso}-{dest_iso}"

    # Helsinki NLP: these pairs only, and its model names use "en"/"fr"
    if iso_pair in ("mos-eng", "eng-mos", "fra-mos"):
        helsinki_src = src_iso.lower().replace("eng", "en").replace("fra", "fr")
        helsinki_dest = dest_iso.replace("eng", "en").replace("fra", "fr")
        helsinki_text = translate_helsinki_nlp(s, helsinki_src, helsinki_dest)
        sections.append(f"\n\n\nHelsinkiNLP's Opus translation is:\n\n {helsinki_text}")

    # Masakhane: Mossi <-> French only, and its model names use "fr"
    if iso_pair in ("mos-fra", "fra-mos"):
        masakhane_src = src_iso.lower().replace("fra", "fr")
        masakhane_dest = dest_iso.replace("fra", "fr")
        masakhane_text = translate_masakhane(s, masakhane_src, masakhane_dest)
        sections.append("\n\n\nMasakhane's M2M translation is:\n\n" + masakhane_text)

    return "".join(sections)
|
136 |
+
|