File size: 7,787 Bytes
e5e9b34 4845dd7 e5e9b34 571e8a3 e5e9b34 4845dd7 e5e9b34 4845dd7 e5e9b34 2d1fac5 e5e9b34 4845dd7 e5e9b34 4845dd7 e5e9b34 4845dd7 db3d44a 4845dd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import torch
import scipy
import os
import streamlit as st
import pandas as pd
from transformers import set_seed, pipeline
from transformers import VitsTokenizer, VitsModel
from datasets import load_dataset, Audio
from src import *
#from huggingface_hub import login
#from dotenv import load_dotenv
#load_dotenv()
#HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
#login(HUGGINGFACE_KEY)
########################
# Language codes offered by every language selector in this app; the
# selectors display them through `decode_iso`.
language_list = [
    'mos',
    'fra',
    'eng',
]

st.title("Demo: Automated Tools for Mooré Language")

# One tab per tool plus an "About" tab. Each handle is used as a context
# manager further down to place widgets inside the matching tab.
tts, stt, trans, lid, about = st.tabs(
    [
        "Text to speech",
        "Speech to text",
        "Translation",
        "Language ID",
        "**About**",
    ]
)
########################
# --- Text-to-speech tab -------------------------------------------------
# Reads free text and a language code, and on "Speak" plays the audio
# returned by `synthesize_facebook`.
with tts:
    tts_text = st.text_area(
        label="Please enter your text here:",
        value="",
        placeholder="ne y wĩndga",
    )

    # Two-column layout so the language selector only takes half the width;
    # the second column is intentionally left empty.
    tts_col1, _ = st.columns(2)
    with tts_col1:
        tts_lang = st.selectbox('Language of text', (language_list), format_func=decode_iso)

    if st.button("Speak"):
        st.divider()
        if not tts_text.strip():
            # The placeholder is not a value: without this guard an empty
            # string would be sent to the synthesizer.
            st.warning("Please enter some text to synthesize.")
        else:
            with st.spinner(":rainbow[Synthesizing, please wait...]"):
                synth = synthesize_facebook(tts_text, tts_lang)
            # NOTE(review): assumes `synthesize_facebook` returns raw samples
            # at 16 kHz — confirm against its implementation.
            st.audio(synth, sample_rate=16_000)
########################
# --- Speech-to-text tab -------------------------------------------------
# Accepts an uploaded audio clip and a language code, and on "Transcribe"
# shows the text returned by `transcribe`. Below that, it shows pre-computed
# example transcriptions loaded from a CSV file.
with stt:
    stt_file = st.file_uploader(
        "Please upload an audio file:",
        type=['mp3', 'm4a'],
        key="stt_uploader",
    )
    stt_lang = st.selectbox("Please select the language:", (language_list), format_func=decode_iso)

    if st.button("Transcribe"):
        st.divider()
        if stt_file is None:
            # The uploader yields None until a file is chosen; do not hand
            # that to the transcriber.
            st.warning("Please upload an audio file before transcribing.")
        else:
            with st.spinner(":rainbow[Received your file, please wait while I process it...]"):
                # Stored under its own name so the `stt` tab handle is not
                # rebound to a string.
                transcription = transcribe(stt_file, stt_lang)
            st.write(":violet[The transcription is:]")
            st.write(':violet[ "' + transcription + '"]')

    st.subheader("Examples")
    st.write("Using the supplied clips, here are the transcriptions:")

    df = pd.read_csv("data/speech_to_text.csv")
    # NOTE(review): labels are assigned by position, so this assumes the CSV
    # has exactly these five columns in this order — confirm against the file.
    df.columns = [
        'Clip ID',
        'Spoken in Moore',
        'Spoken in French',
        'Transcription in Moore',
        'Transcription in French',
    ]
    df = df.set_index('Clip ID')
    st.table(df[['Spoken in Moore', 'Transcription in Moore']])
    st.table(df[['Spoken in French', 'Transcription in French']])
########################
# --- Translation tab ----------------------------------------------------
# Reads free text plus source/target language codes, and on "Translate"
# shows the result of `translate`. Below that, it shows pre-computed example
# translations loaded from a CSV file.
with trans:
    trans_text = st.text_area(
        label="Please enter your translation text here:",
        value="",
        placeholder="ne y wĩndga",
    )

    #trans_col1, trans_col2, trans_col3 = st.columns([.25, .25, .5])
    trans_col1, trans_col2 = st.columns(2)
    with trans_col1:
        src_lang = st.selectbox('Translate from:', (language_list), format_func=decode_iso)
    with trans_col2:
        # index=1 preselects the second entry so source and target differ by default.
        target_lang = st.selectbox('Translate to:', (language_list), format_func=decode_iso, index=1)
    #with trans_col3:
    #    trans_model = st.selectbox("Translation model:",
    #                               ("Facebook (nllb-200-distilled-600M)",
    #                                "Helsinki NLP (opus-mt-mos-en)",
    #                                "Masakhane (m2m100_418m_mos_fr_news)")
    #                               )

    if st.button("Translate"):
        st.divider()
        with st.spinner(":rainbow[Translating from " + decode_iso(src_lang) + " into " + decode_iso(target_lang) + ", please wait...]"):
            translation = translate(trans_text, src_lang, target_lang) #, trans_model)
        st.write(translation)

    st.subheader("Examples")
    st.write("Using the supplied clips, here are the translations:")

    # Map each CSV column to its display label BY NAME. `read_csv(usecols=...)`
    # returns columns in file order, not in the order of the list, so
    # assigning `df.columns` positionally could mislabel columns.
    column_labels = {
        'ID': 'Clip ID',
        'Moore': 'Original Moore',
        'French': 'Original French',
        'English': 'Original English',
        'tr_meta_mos_eng': 'Moore-English Translation',
        'tr_meta_mos_fra': 'Moore-French Translation',
        'tr_meta_eng_mos': 'English-Moore Translation',
        'tr_meta_fra_mos': 'French-Moore Translation',
    }
    df = pd.read_csv("data/translated_eng.csv", usecols=list(column_labels))
    df = df.rename(columns=column_labels).set_index('Clip ID')

    st.table(df[['Original Moore', 'Moore-French Translation', 'Moore-English Translation']])
    st.table(df[['Original French', 'French-Moore Translation']])
    st.table(df[['Original English', 'English-Moore Translation']])
########################
# --- Language-identification tab ----------------------------------------
# Accepts an uploaded audio clip and, on "Identify", shows the language
# returned by `identify_language` (rendered through `decode_iso`). Below
# that, it shows pre-computed example results loaded from a CSV file.
with lid:
    langid_file = st.file_uploader(
        "Please upload an audio file:",
        type=['mp3', 'm4a'],
        key="lid_uploader",
    )

    if st.button("Identify"):
        st.divider()
        if langid_file is None:
            # The uploader yields None until a file is chosen; do not hand
            # that to the identifier.
            st.warning("Please upload an audio file before identifying its language.")
        else:
            with st.spinner(":rainbow[Received your file, please wait while I process it...]"):
                lang = identify_language(langid_file)
                lang = decode_iso(lang)
            st.write(":violet[The detected language is " + lang + "]")

    st.subheader("Examples")
    st.write("Using the supplied clips, here are the recognized languages:")

    df = pd.read_csv("data/language_id.csv")
    # NOTE(review): labels are assigned by position, so this assumes the CSV
    # has exactly these three columns in this order — confirm against the file.
    df.columns = [
        'Clip ID',
        'Language detected when speaking Mooré',
        'Language detected when speaking French',
    ]
    df = df.set_index('Clip ID')
    st.dataframe(df)
# supported colors: blue, green, orange, red, violet, gray/grey, rainbow.
# https://docs.streamlit.io/library/api-reference/text/st.markdown
# --- About tab ----------------------------------------------------------
# Static markdown describing which models back each feature, with footnotes
# linking to the endpoints, docs and papers.
with about:
    #st.header("How it works")
    # The markdown body is kept at column 0 so that only footnote
    # continuation lines carry indentation. Blank lines separate paragraphs
    # and footnote definitions.
    st.markdown('''
**Text to speech**, **speech to text**, and **language identification** capabilities are provided by Meta's [Massively Multilingual Speech (MMS)](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]

**Translation** capabilities are provided primarily by Meta's [No Language Left Behind (NLLB)](https://ai.meta.com/research/no-language-left-behind/) model, which supports translation between 200 languages.[^3]

We compare Meta's NLLB translations to two other translation alternatives. Masakhane, an African NLP initiative, offers endpoints for translations between Mooré and French.[^4] Helsinki NLP offers endpoints between Mooré and English, and one endpoint from French to Mooré.[^5]

Facebook has since released [SeamlessM4T](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t) which also provides support for audio-to-audio translation, however, Mooré is not currently one of the included languages.

[^1]: Endpoints used: TTS ([English](https://huggingface.co/facebook/mms-tts-eng),
    [French](https://huggingface.co/facebook/mms-tts-fra),
    [Mooré](https://huggingface.co/facebook/mms-tts-mos)),
    [STT](https://huggingface.co/facebook/mms-1b-all),
    [LID](https://huggingface.co/facebook/mms-lid-256). For language ID, the 256-language variant was chosen as this was the model with the smallest number of languages, which still included Mooré.
    Learn more:
    [Docs](https://huggingface.co/docs/transformers/model_doc/mms) |
    [Paper](https://arxiv.org/abs/2305.13516) |
    [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)

[^3]: Endpoint used: [NLLB](https://huggingface.co/facebook/nllb-200-distilled-600M).
    Learn more:
    [Docs](https://huggingface.co/docs/transformers/model_doc/nllb) |
    [Paper](https://arxiv.org/abs/2207.04672) |
    [Supported languages](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)

[^4]: Endpoint used: [Mooré to French](https://huggingface.co/masakhane/m2m100_418M_mos_fr_news),
    [French to Mooré](https://huggingface.co/masakhane/m2m100_418M_fr_mos_news).
    Learn more:
    [Docs](https://github.com/masakhane-io/lafand-mt) |
    [Paper](https://arxiv.org/abs/2205.02022)

[^5]: Endpoints used: [Mooré to English](https://huggingface.co/Helsinki-NLP/opus-mt-mos-en),
    [English to Mooré](https://huggingface.co/Helsinki-NLP/opus-mt-en-mos),
    [French to Mooré](https://huggingface.co/Helsinki-NLP/opus-mt-fr-mos).
    Learn more:
    [Docs](https://github.com/Helsinki-NLP/Opus-MT)
''')