Spaces:

Maoweicao
/

xttsv2

Runtime error

App Files Files Community

xttsv2 / app.py

Maoweicao

init file

ca34e42 12 months ago

raw

history blame contribute delete

15 kB

	import sys
	import os,stat
	import subprocess
	import random
	from zipfile import ZipFile
	import uuid

	# By using XTTS you agree to CPML license https://coqui.ai/cpml
	os.environ["COQUI_TOS_AGREED"] = "1"

	# langid is used to detect language for longer text
	# Most users expect text to be their own language, there is checkbox to disable it
	import langid

	import gradio as gr
	from TTS.api import TTS
	from TTS.tts.configs.xtts_config import XttsConfig
	from TTS.tts.models.xtts import Xtts
	from TTS.utils.generic_utils import get_user_data_dir
	# Token used to authenticate against the Hugging Face Hub (None when unset).
	HF_TOKEN = os.environ.get("HF_TOKEN")
	from huggingface_hub import HfApi

	# The API client is used to restart the Space after an unrecoverable error.
	api = HfApi(token=HF_TOKEN)
	# Restart the Space this app is actually running in: Hugging Face Spaces
	# expose their "owner/name" id through the SPACE_ID environment variable.
	# Fall back to the previously hard-coded upstream id when it is not set.
	repo_id = os.environ.get("SPACE_ID", "coqui/xtts")

	# Use a newer ffmpeg binary (shipped as ffmpeg.zip next to this file) so the
	# audio clean-up filters are available for microphone input.
	print("Export newer ffmpeg binary for denoise filter")
	# Context manager makes sure the archive handle is closed after extraction.
	with ZipFile("ffmpeg.zip") as ffmpeg_archive:
	    ffmpeg_archive.extractall()
	print("Make ffmpeg binary executable")
	st = os.stat("ffmpeg")
	# Add the owner-execute bit on top of the existing permission bits.
	os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)

	# Download the XTTS v2 checkpoint and load it onto the best available device.
	from TTS.utils.manage import ModelManager
	import torch

	model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
	ModelManager().download_model(model_name)
	# Local directory of the downloaded model: "/" in the name becomes "--".
	model_path = os.path.join(
	    get_user_data_dir("tts"),
	    model_name.replace("/", "--"),
	)
	print("XTTS downloaded")

	tts = TTS(model_name)
	# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
	tts.to("cuda" if torch.cuda.is_available() else "cpu")

	# Debugging aid only: a flag recording that a CUDA device-side assert was
	# seen, plus the prompt and language of the first request that caused it.
	DEVICE_ASSERT_DETECTED = 0
	DEVICE_ASSERT_PROMPT = DEVICE_ASSERT_LANG = None

	def _filter_reference_audio(speaker_wav):
	    """Clean up *speaker_wav* with the bundled ffmpeg and return the file to use.

	    Applies a low-pass/high-pass filter followed by two
	    ``areverse,silenceremove`` passes, writing the result to a new ``.wav``
	    file next to the input. This is a best-effort step: if ffmpeg fails or
	    cannot be started, the original path is returned unchanged.
	    """
	    audio_filters = (
	        "lowpass=8000,highpass=75,"
	        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
	        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
	    )
	    # The ".wav" suffix tells ffmpeg which output format to produce.
	    out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"
	    # Pass an argument list (not a split string) so paths containing spaces
	    # reach ffmpeg as single arguments.
	    command = ["./ffmpeg", "-y", "-i", speaker_wav, "-af", audio_filters, out_filename]
	    try:
	        subprocess.run(command, capture_output=False, text=True, check=True)
	    except (subprocess.CalledProcessError, OSError):
	        # Non-zero exit code, or the binary could not be executed at all.
	        print("Error: failed filtering, use original microphone input")
	        return speaker_wav
	    print("Filtered microphone input")
	    return out_filename


	def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
	    """Synthesise *prompt* in the voice of a reference recording.

	    Args:
	        prompt: Text to speak; must be between 2 and 200 characters long.
	        language: Language code of the text; must be one of the supported codes.
	        audio_file_pth: Path of the uploaded reference audio.
	        mic_file_path: Path of the microphone recording, or None.
	        use_mic: When true, use ``mic_file_path`` instead of ``audio_file_pth``.
	        voice_cleanup: When true, filter the reference audio with ffmpeg first.
	        no_lang_auto_detect: When true, skip the detected-vs-chosen language check.
	        agree: Must be true (terms accepted) for synthesis to run.

	    Returns:
	        A 3-tuple ``(waveform video, "output.wav", reference audio path)`` on
	        success, or ``(None, None, None)`` after a warning was shown or after
	        a Space restart was requested.

	    Raises:
	        RuntimeError: Re-raised when synthesis fails for a reason other than
	            a CUDA device-side assert.
	    """
	    global DEVICE_ASSERT_DETECTED, DEVICE_ASSERT_PROMPT, DEVICE_ASSERT_LANG

	    no_output = (None, None, None)

	    if not agree:
	        gr.Warning("Please accept the Terms & Condition!")
	        return no_output

	    supported_languages = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu"]
	    if language not in supported_languages:
	        gr.Warning("Language you put in is not in is not in our Supported Languages, please choose from dropdown")
	        return no_output

	    # strip() is needed because the detected code can carry trailing whitespace.
	    language_predicted = langid.classify(prompt)[0].strip()
	    # The TTS model expects Chinese as "zh-cn".
	    if language_predicted == "zh":
	        language_predicted = "zh-cn"
	    print(f"Detected language:{language_predicted}, Chosen language:{language}")

	    # Only enforce the language check on text longer than 15 characters:
	    # short text may be common to several languages. The user can disable
	    # the check entirely with the auto-detection checkbox.
	    if len(prompt) > 15 and language_predicted != language and not no_lang_auto_detect:
	        gr.Warning("It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox")
	        return no_output

	    if use_mic:
	        if mic_file_path is None:
	            gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
	            return no_output
	        speaker_wav = mic_file_path
	    else:
	        speaker_wav = audio_file_pth

	    # Without a reference recording neither the ffmpeg step nor the voice
	    # cloning can work, so stop with a clear message instead of crashing.
	    if speaker_wav is None:
	        gr.Warning("Please upload a reference audio file, or check Use Microphone and record your voice")
	        return no_output

	    # Validate the prompt length before spending time on audio filtering.
	    if len(prompt) < 2:
	        gr.Warning("Please give a longer prompt text")
	        return no_output
	    if len(prompt) > 200:
	        gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
	        return no_output

	    # Fast (not perfect) filtering of the reference audio, applied on demand.
	    if voice_cleanup:
	        speaker_wav = _filter_reference_audio(speaker_wav)

	    if DEVICE_ASSERT_DETECTED:
	        # Unlikely to be reached: the Space is restarted on the first
	        # unrecoverable error.
	        print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")

	    try:
	        tts.tts_to_file(
	            text=prompt,
	            file_path="output.wav",
	            language=language,
	            speaker_wav=speaker_wav,
	        )
	    except RuntimeError as e:
	        if "device-side assert" not in str(e):
	            print("RuntimeError: non device-side assert error:", str(e))
	            raise

	        # Nothing can be done about a CUDA device-side error except a restart.
	        print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
	        gr.Warning("Unhandled Exception encounter, please retry in a minute")
	        print("Cuda device-assert Runtime encountered need restart")
	        if not DEVICE_ASSERT_DETECTED:
	            DEVICE_ASSERT_DETECTED = 1
	            DEVICE_ASSERT_PROMPT = prompt
	            DEVICE_ASSERT_LANG = language

	        # HF Space specific: this error is unrecoverable, so restart the Space.
	        api.restart_space(repo_id=repo_id)
	        # No audio was produced for this request; do not return a stale file.
	        return no_output

	    return (
	        gr.make_waveform(audio="output.wav"),
	        "output.wav",
	        speaker_wav,
	    )


	# Page title; passed as `title=` to gr.Interface below.
	title = "🐸 XTTSv2 - 3秒语音合成，支持中英双语，告别电音！"

	# Markdown/HTML blurb passed as `description=` to gr.Interface.
	# The f-string has no placeholders; .strip() removes the leading and trailing
	# whitespace of the whole literal only.
	# NOTE(review): the text advertises 13 languages while the language dropdown
	# in this file lists 16 codes — confirm which number is intended.
	description = f"""
	## <center>🌟 - 只需上传3~10秒语音，支持13种语言，中文能力极大增强！</center>
	### <center>🤗 - 使用[Colab笔记本](https://github.com/KevinWang676/Bark-Voice-Cloning)运行；Powered by [Coqui AI](https://coqui.ai/)</center>
	### <center>🌊 - 更多精彩应用，尽在[滔滔AI](http://www.talktalkai.com)；滔滔AI，为爱滔滔！💕</center>
	### <center>😺️☘️ - 猫尾草修改版 - coqui xTTS v2</center>
	### <center>女声示例 - The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory.</center>
	### <center>男声示例 - It is a pretty little spot there: a grass plateau, running along by the water's edge, and overhung by willows. </center>
	""".strip()


	# HTML snippet passed as `article=` to gr.Interface (usage notice and a quote).
	article = """
	<div style='margin:20px auto;'>
	<p>注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。</p>
	<p>🌊🏞️🎶 - 江水东流急，滔滔无尽声。明·顾璘</p>
	</div>
	"""
	# Compact description of the example rows shown under the interface:
	# (prompt text, language code, reference audio path, voice clean-up flag).
	_example_specs = [
	    ("Once when I was six years old I saw a magnificent picture", "en", "examples/female.wav", False),
	    ("Lorsque j'avais six ans j'ai vu, une fois, une magnifique image", "fr", "examples/male.wav", False),
	    ("Als ich sechs war, sah ich einmal ein wunderbares Bild", "de", "examples/female.wav", False),
	    ("Cuando tenía seis años, vi una vez una imagen magnífica", "es", "examples/male.wav", False),
	    ("Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica", "pt", "examples/female.wav", False),
	    ("Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek", "pl", "examples/male.wav", False),
	    ("Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno", "it", "examples/female.wav", False),
	    ("Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm", "tr", "examples/female.wav", False),
	    ("Когда мне было шесть лет, я увидел однажды удивительную картинку", "ru", "examples/female.wav", False),
	    ("Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat", "nl", "examples/male.wav", False),
	    ("Když mi bylo šest let, viděl jsem jednou nádherný obrázek", "cs", "examples/female.wav", False),
	    ("当我还只有六岁的时候，看到了一副精彩的插画", "zh-cn", "examples/female.wav", False),
	    ("かつて六歳のとき、素晴らしい絵を見ました", "ja", "examples/female.wav", True),
	    ("한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.", "ko", "examples/female.wav", True),
	    ("Egyszer hat éves koromban láttam egy csodálatos képet", "hu", "examples/male.wav", True),
	    ("当我还只有六岁的时候，看到了一副精彩的插画", "zh-cn", "examples/xiaoxiao(edgetts).mp3", False),
	    ("当我还只有六岁的时候，看到了一副精彩的插画", "zh-cn", "examples/jenny(edgetts).mp3", False),
	    ("当我还只有六岁的时候，看到了一副精彩的插画", "zh-cn", "examples/xiaoni(edgetts).mp3", False),
	    ("当我还只有六岁的时候，看到了一副精彩的插画", "zh-cn", "examples/hsiaochen(edgetts).mp3", False),
	]

	# One row per example, with exactly one value per predict() input, in order:
	# prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup,
	# no_lang_auto_detect, agree. Building the rows from the table guarantees
	# every row has the same eight columns (one row previously carried a stray
	# ninth value).
	examples = [
	    [text, lang, reference_audio, None, False, cleanup, False, True]
	    for text, lang, reference_audio, cleanup in _example_specs
	]



	# Language codes offered in the dropdown.
	language_choices = [
	    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
	    "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu",
	]

	# Input widgets, in the same order as the parameters of predict().
	input_components = [
	    gr.Textbox(label="想要合成的文本内容", lines=3, placeholder="想说却还没说的还很多"),
	    gr.Dropdown(
	        label="请选择文本内容对应的语言",
	        choices=language_choices,
	        max_choices=1,
	        value="zh-cn",
	    ),
	    gr.Audio(label="通过文件上传语音", type="filepath", value="examples/female.wav"),
	    gr.Audio(
	        source="microphone",
	        type="filepath",
	        label="使用麦克风上传语音",
	        info="移动端更稳定，电脑端可能无法上传",
	        streaming=True,
	    ),
	    gr.Checkbox(label="是否使用麦克风上传语音", value=False, info="默认为否"),
	    gr.Checkbox(label="是否需要去除背景音", value=False, info="默认为否"),
	    gr.Checkbox(label="不使用自动探测语言", value=False, info="勾选此选项则不使用自动探测语言"),
	    gr.Checkbox(
	        label="使用条款",
	        value=True,
	        info="我承诺：不会利用此程序生成对个人或组织造成侵害的任何内容",
	    ),
	]

	# Output widgets, matching the 3-tuple returned by predict(); only the
	# waveform video is visible.
	output_components = [
	    gr.Video(label="为您合成的专属音频"),
	    gr.Audio(label="Synthesised Audio", visible=False),
	    gr.Audio(label="Reference Audio Used", visible=False),
	]

	demo = gr.Interface(
	    fn=predict,
	    inputs=input_components,
	    outputs=output_components,
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	)
	# Enable the request queue, then serve on all network interfaces.
	demo.queue().launch(debug=True, show_api=False, server_name="0.0.0.0")