Spaces:

OOI-FrontierTech
/

tts_mockingbird

Running

App Files Files Community

tts_mockingbird / app.py

khof312

Add support for IMS Toucan.

593cb11 4 months ago

raw

history blame

11.2 kB

	import torch
	import scipy
	import os
	import streamlit as st
	import pandas as pd
	from transformers import pipeline #set_seed,
	from transformers import VitsTokenizer, VitsModel
	from datasets import load_dataset, Audio
	from huggingface_hub.inference_api import InferenceApi

	from src import *


	########################
	# Page banner: a wide column for the titles and a narrow one for the logo.
	col1, col2 = st.columns([20,3])

	# Same rendering order as before: logo first, then the two headings.
	col2.image('logo.png', use_column_width=True)
	col1.title("Mockingbird")
	col1.header("A demo of open Text to Speech tools")

	# The two tabs that hold the rest of the app.
	tab_labels = ["Text to speech", "About"]
	tts, about = st.tabs(tab_labels)

	########################
	# Synthesizers shown in the result tables, in display order. Each entry is:
	#   (key used for models[lang][key] and for the temporary wav file name,
	#    synthesis function,
	#    markdown label for the "Default models" table,
	#    plain label for the "Voice conversion" table)
	_SYNTHESIZERS = [
	    ("mms", synth_mms,
	     "[Meta MMS](https://huggingface.co/docs/transformers/main/en/model_doc/mms)",
	     "Meta MMS"),
	    ("coqui", synth_coqui,
	     "[Coqui](https://docs.coqui.ai/en/latest/index.html)",
	     "Coqui"),
	    ("espeakng", synth_espeakng,
	     "[Espeak-ng](https://github.com/espeak-ng/espeak-ng)",
	     "Espeak-ng"),
	    ("toucan", synth_toucan,
	     "[IMS-Toucan](https://github.com/DigitalPhonetics/IMS-Toucan)",
	     "IMS Toucan"),
	]

	# Fine-tuned MMS checkpoints per language code. Each entry is:
	#   (markdown label for the "Configuration" column, Hugging Face repo id)
	_FINETUNED_MMS = {
	    "swh": [
	        ("[female 1](https://huggingface.co/khof312/mms-tts-swh-female-1)",
	         "khof312/mms-tts-swh-female-1"),
	        ("[female 2](https://huggingface.co/khof312/mms-tts-swh-female-2)",
	         "khof312/mms-tts-swh-female-2"),
	    ],
	    "spa": [
	        ("[ylacombe - Argentinian](https://huggingface.co/ylacombe/mms-spa-finetuned-argentinian-monospeaker)",
	         "ylacombe/mms-spa-finetuned-argentinian-monospeaker"),
	        ("[ylacombe - Chilean](https://huggingface.co/ylacombe/mms-spa-finetuned-chilean-monospeaker)",
	         "ylacombe/mms-spa-finetuned-chilean-monospeaker"),
	        ("[ylacombe - Colombian](https://huggingface.co/ylacombe/mms-spa-finetuned-colombian-monospeaker)",
	         "ylacombe/mms-spa-finetuned-colombian-monospeaker"),
	    ],
	}


	# Render the three-column header row shared by every results table.
	def _audio_table_header():
	    header = st.columns([1, 1, 2])
	    header[0].write("Model")
	    header[1].write("Configuration")
	    header[2].write("Audio")


	# Render one results row on its own line of columns.
	# `audio` is indexed as audio[0] (samples) and audio[1] (sample rate), as in
	# the original code; a None value means "nothing to show" and is skipped.
	def _audio_table_row(model_label, config_label, audio):
	    if audio is None:
	        return
	    row = st.columns([1, 1, 2])
	    row[0].write(model_label)
	    row[1].write(config_label)
	    row[2].audio(audio[0], sample_rate=audio[1])


	# Write one synthesizer's output to source_speaker_<key>.wav and run it
	# through convert_coqui against the target speaker file.
	# Returns None when there is no audio to convert.
	def _convert_voice(key, audio, target_speaker):
	    if audio is None:
	        return None
	    source_path = f"source_speaker_{key}.wav"
	    scipy.io.wavfile.write(source_path, rate=audio[1], data=audio[0].T)
	    return convert_coqui(source_path, target_speaker)


	with tts:

	    # Configurations -- language choice and text
	    tts_lang = st.selectbox('Language of text', (language_list), format_func = decode_iso)
	    tts_text = st.text_area(label = "Please enter your sentence here:",
	                            value="", placeholder=placeholders[tts_lang] )

	    target_speaker_file = st.file_uploader("If you would like to test voice conversion, you may upload your audio below. You should upload one file in .wav format. If you don't, a default file will be used.",
	                                           type=['wav'])

	    # Inference
	    if st.button("Generate"):

	        # Warning about alphabet support
	        if tts_lang in ['rus', 'fas']:
	            st.warning("WARNING! On Windows, ESpeak-NG has trouble synthesizing output when input is provided from non-Latin alphabets.")

	        st.divider()

	        # Synthesis
	        with st.spinner(":rainbow[Synthesizing, please wait... (this will be slowest the first time you generate audio in a new language)]"):
	            # An empty text box falls back to the language's placeholder sentence.
	            if tts_text == "":
	                tts_text = placeholders[tts_lang]

	            # First, make the audio: every base synthesizer, then any
	            # fine-tuned MMS checkpoints registered for this language.
	            base = {
	                key: synth(tts_text, models[tts_lang][key])
	                for key, synth, _, _ in _SYNTHESIZERS
	            }
	            finetuned = [
	                (config_label, synth_mms(tts_text, repo_id))
	                for config_label, repo_id in _FINETUNED_MMS.get(tts_lang, [])
	            ]

	            st.write("## Synthesis")
	            st.write("### Default models")
	            _audio_table_header()
	            # Every synthesizer, IMS Toucan included, is skipped when it
	            # returned None instead of crashing on the subscript.
	            for key, _, default_label, _ in _SYNTHESIZERS:
	                _audio_table_row(default_label, "default", base[key])

	            #################################################################
	            if finetuned:
	                st.write("### Fine Tuned")
	                _audio_table_header()
	                for config_label, audio in finetuned:
	                    _audio_table_row("Meta MMS", config_label, audio)

	            st.divider()

	            st.write("## Voice conversion") #################################################################

	            st.warning('''Note: The naturalness of the audio will only be as good as that of the audio in "default models" above.''')

	            # Use the uploaded wav as the target voice, else the bundled default.
	            # NOTE(review): the upload is re-written to a fixed file name in the
	            # working directory; concurrent sessions would overwrite each
	            # other's file -- confirm whether that matters for this deployment.
	            if target_speaker_file is not None:
	                rate, wav = scipy.io.wavfile.read(target_speaker_file)
	                scipy.io.wavfile.write("target_speaker_custom.wav", data=wav, rate=rate)
	                target_speaker = "target_speaker_custom.wav"
	            else:
	                target_speaker = "target_speaker.wav"

	            # Convert everything first, then draw the table.
	            converted = {
	                key: _convert_voice(key, base[key], target_speaker)
	                for key, _, _, _ in _SYNTHESIZERS
	            }

	            # The header gets its own row; previously the Meta MMS result was
	            # written into the same row as the column headers.
	            _audio_table_header()
	            for key, _, _, converted_label in _SYNTHESIZERS:
	                _audio_table_row(converted_label, "converted", converted[key])

	with about:
	    # Static "About" page. The literal is a raw string because the footnote
	    # text contains "\|", which is an invalid escape sequence in a normal
	    # string literal; the rendered text is unchanged by the r-prefix.
	    st.markdown(r'''# Mockingbird TTS Demo
	This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 4 synthesizers are supported:
	- [Meta's Massively Multilingual Speech (MMS)](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
	- [Coqui's TTS](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
	- [ESpeak-NG](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
	- [IMS Toucan](https://github.com/DigitalPhonetics/IMS-Toucan), which supports 7000 languages. [^4]

	Voice conversion is currently achieved through Coqui.

	Notes:
	1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
	2. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
	3. Not all synthesizers support a given language.



	[^1]: Endpoints used are of the form https://huggingface.co/facebook/mms-tts-[LANG].
	Learn more:
	[Docs](https://huggingface.co/docs/transformers/model_doc/mms) \|
	[Paper](https://arxiv.org/abs/2305.13516) \|
	[Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)

	[^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
	[^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
	[^4]: Language list is available in the Gradio API documentation [here](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS).
	''')