import torch
import scipy
import os
import streamlit as st
import pandas as pd
from transformers import set_seed, pipeline
from transformers import VitsTokenizer, VitsModel
from datasets import load_dataset, Audio
from src import *
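
# The wildcard import above is expected to provide the helpers used throughout this
# app: decode_iso (ISO code -> display name), synthesize_facebook (text to speech),
# transcribe (speech to text), translate (text translation), and identify_language
# (spoken-language ID). Their implementations live in src/ and are not shown here.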

#from huggingface_hub import login
#from dotenv import load_dotenv

#load_dotenv()
#HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
#login(HUGGINGFACE_KEY)


########################
language_list = ['mos', 'fra', 'eng']


st.title("Demo: Automated Tools for Mooré Language")
tts, stt, trans, lid, about = st.tabs(["Text to speech", "Speech to text", "Translation", "Language ID", "**About**"])

########################
with tts:
    
    tts_text = st.text_area(label = "Please enter your text here:", value="", placeholder="ne y wĩndga")

    tts_col1, tts_col2 = st.columns(2)

    with tts_col1:
        tts_lang = st.selectbox('Language of text', (language_list), format_func = decode_iso)
    
    

    if st.button("Speak"):
        st.divider()
        with st.spinner(":rainbow[Synthesizing, please wait...]"):
            synth = synthesize_facebook(tts_text, tts_lang)
            st.audio(synth, sample_rate=16_000)
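
# Illustrative sketch only, not the src/ implementation: per the About tab, TTS uses
# Meta's MMS checkpoints (facebook/mms-tts-eng/-fra/-mos), so synthesize_facebook
# presumably wraps them with the VitsTokenizer/VitsModel classes imported above.
# The function name below is hypothetical and is never called by the app.
def _sketch_synthesize(text: str, lang: str):
    tokenizer = VitsTokenizer.from_pretrained(f"facebook/mms-tts-{lang}")
    model = VitsModel.from_pretrained(f"facebook/mms-tts-{lang}")
    inputs = tokenizer(text=text, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**inputs).waveform[0]
    # MMS TTS generates 16 kHz audio, matching the sample_rate passed to st.audio above.
    return waveform.numpy()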


########################
with stt:

    stt_file = st.file_uploader("Please upload an audio file:", type=['mp3', 'm4a'], key = "stt_uploader")
    stt_lang = st.selectbox("Please select the language:" , (language_list), format_func = decode_iso)


    if st.button("Transcribe"):
        st.divider()
        with st.spinner(":rainbow[Received your file, please wait while I process it...]"):
            stt_result = transcribe(stt_file, stt_lang)
            ":violet[The transcription is:]"
            ':violet[ "' + stt_result + '"]'

    st.subheader("Examples")
    "Using the supplied clips, here are the transcriptions:"
    df = pd.read_csv("data/speech_to_text.csv")
    df.columns = ['Clip ID', 'Spoken in Moore', 'Spoken in French', 'Transcription in Moore', 'Transcription in French']
    
    df.set_index('Clip ID', inplace=True)
    st.table(df[['Spoken in Moore', 'Transcription in Moore']])
    
    st.table(df[['Spoken in French', 'Transcription in French']])
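
# Illustrative sketch only, not the src/ implementation: per the About tab, speech to
# text uses Meta's MMS ASR checkpoint facebook/mms-1b-all with a per-language adapter.
# A minimal version built on the transformers pipeline imported above could look like
# this; the function name is hypothetical and is never called by the app.
def _sketch_transcribe(uploaded_file, lang: str):
    asr = pipeline(
        "automatic-speech-recognition",
        model="facebook/mms-1b-all",
        model_kwargs={"target_lang": lang, "ignore_mismatched_sizes": True},
    )
    # The pipeline accepts the raw audio bytes and decodes them (ffmpeg required).
    return asr(uploaded_file.getvalue())["text"]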

########################
with trans:
    
    trans_text = st.text_area(label = "Please enter your translation text here:", value="", placeholder="ne y wĩndga")
    #trans_col1, trans_col2, trans_col3 = st.columns([.25, .25, .5])
    trans_col1, trans_col2 = st.columns(2)

    with trans_col1:
        src_lang = st.selectbox('Translate from:', (language_list), format_func = decode_iso)
    with trans_col2:
        target_lang = st.selectbox('Translate to:', (language_list), format_func = decode_iso, index=1)
    #with trans_col3:
    #    trans_model = st.selectbox("Translation model:",
    #                            ("Facebook (nllb-200-distilled-600M)", 
    #                             "Helsinki NLP (opus-mt-mos-en)", 
    #                             "Masakhane (m2m100_418m_mos_fr_news)")
    #                           )
    
    
    if st.button("Translate"):
        st.divider()
        with st.spinner(":rainbow[Translating from " + decode_iso(src_lang) + " into " + decode_iso(target_lang) + ", please wait...]"):
            translation = translate(trans_text, src_lang, target_lang) #, trans_model)
            translation



    st.subheader("Examples")
    "Using the supplied clips, here are the translations:"
    df = pd.read_csv("data/translated_eng.csv",
                    usecols=['ID', 'French', 'Moore', 'English', 
                             'tr_meta_mos_fra', 'tr_meta_mos_eng', 'tr_meta_eng_mos', 'tr_meta_fra_mos'])
    
    # Display names are assigned by position, so this list must follow the column
    # order of translated_eng.csv (pandas' usecols does not reorder columns).
    df.columns = ['Clip ID', 'Original Moore', 'Original French', 'Original English',
                  'Moore-English Translation', 'Moore-French Translation',
                  'English-Moore Translation', 'French-Moore Translation']
    
    df.set_index('Clip ID', inplace=True)
    
    st.table(df[['Original Moore', 'Moore-French Translation', 'Moore-English Translation']])
    st.table(df[['Original French', 'French-Moore Translation']])
    st.table(df[['Original English', 'English-Moore Translation']])
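
# Illustrative sketch only, not the src/ implementation: per the About tab, translation
# primarily uses Meta's NLLB checkpoint facebook/nllb-200-distilled-600M. NLLB expects
# FLORES-200 codes, so the app's ISO codes would need a mapping like the one below.
# These names are hypothetical and the function is never called by the app.
_FLORES_CODES = {"mos": "mos_Latn", "fra": "fra_Latn", "eng": "eng_Latn"}

def _sketch_translate(text: str, src_iso: str, tgt_iso: str):
    translator = pipeline(
        "translation",
        model="facebook/nllb-200-distilled-600M",
        src_lang=_FLORES_CODES[src_iso],
        tgt_lang=_FLORES_CODES[tgt_iso],
    )
    return translator(text)[0]["translation_text"]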

########################
with lid:
    langid_file = st.file_uploader("Please upload an audio file:", type=['mp3', 'm4a'], key = "lid_uploader")

    if st.button("Identify"):
        st.divider()
        with st.spinner(":rainbow[Received your file, please wait while I process it...]"):
            lang = identify_language(langid_file)
            lang = decode_iso(lang)
            ":violet[The detected language is " + lang + "]"

    st.subheader("Examples")
    "Using the supplied clips, here are the recognized languages:"
    df = pd.read_csv("data/language_id.csv")
    df.columns = ['Clip ID', 'Language detected when speaking Mooré', 'Language detected when speaking French']
    df.set_index('Clip ID', inplace=True)
    st.dataframe(df)


    # supported colors: blue, green, orange, red, violet, gray/grey, rainbow.
    # https://docs.streamlit.io/library/api-reference/text/st.markdown
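
# Illustrative sketch only, not the src/ implementation: per the About tab, language ID
# uses Meta's MMS LID checkpoint facebook/mms-lid-256, an audio-classification model
# whose labels are ISO 639-3 codes such as "mos", "fra", and "eng". The function name
# is hypothetical and is never called by the app.
def _sketch_identify_language(uploaded_file):
    classifier = pipeline("audio-classification", model="facebook/mms-lid-256")
    predictions = classifier(uploaded_file.getvalue())
    # Predictions come back sorted by score; return the ISO code of the top candidate.
    return predictions[0]["label"]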

with about:
    #st.header("How it works")
    st.markdown('''
**Text to speech**, **speech to text**, and **language identification** capabilities are provided by Meta's [Massively Multilingual Speech (MMS)](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]

**Translation** capabilities are provided primarily by Meta's [No Language Left Behind (NLLB)](https://ai.meta.com/research/no-language-left-behind/) model, which supports translation between 200 languages.[^2]
We compare Meta's NLLB translations against two alternatives. Masakhane, an African NLP initiative, offers endpoints for translation between Mooré and French.[^3] Helsinki NLP offers endpoints between Mooré and English, and one endpoint from French to Mooré.[^4]

Meta has since released [SeamlessM4T](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t), which also supports audio-to-audio translation; however, Mooré is not currently among its supported languages.
[^1]: Endpoints used: TTS ([English](https://huggingface.co/facebook/mms-tts-eng), 
    [French](https://huggingface.co/facebook/mms-tts-fra), 
    [Mooré](https://huggingface.co/facebook/mms-tts-mos)),
    [STT](https://huggingface.co/facebook/mms-1b-all), 
    [LID](https://huggingface.co/facebook/mms-lid-256).  For language ID, the 256-language variant was chosen because it covers the fewest languages while still including Mooré.   
    Learn more:
    [Docs](https://huggingface.co/docs/transformers/model_doc/mms) | 
    [Paper](https://arxiv.org/abs/2305.13516) | 
    [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)
[^2]: Endpoint used: [NLLB](https://huggingface.co/facebook/nllb-200-distilled-600M).   
    Learn more: 
    [Docs](https://huggingface.co/docs/transformers/model_doc/nllb) | 
    [Paper](https://arxiv.org/abs/2207.04672) | 
    [Supported languages](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)
[^3]: Endpoints used: [Mooré to French](https://huggingface.co/masakhane/m2m100_418M_mos_fr_news), 
    [French to Mooré](https://huggingface.co/masakhane/m2m100_418M_fr_mos_news).   
    Learn more:
    [Docs](https://github.com/masakhane-io/lafand-mt) |
    [Paper](https://arxiv.org/abs/2205.02022)
[^4]: Endpoints used: [Mooré to English](https://huggingface.co/Helsinki-NLP/opus-mt-mos-en),
    [English to Mooré](https://huggingface.co/Helsinki-NLP/opus-mt-en-mos),
    [French to Mooré](https://huggingface.co/Helsinki-NLP/opus-mt-fr-mos).   
    Learn more:
    [Docs](https://github.com/Helsinki-NLP/Opus-MT) 
''')