|
import sys
|
|
import time
|
|
import os
|
|
import logging
|
|
|
|
import gradio as gr
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pypinyin import lazy_pinyin
|
|
from i18n import gettext, Translate
|
|
|
|
from api import generate_api, get_audio, generate_voice, load_characters_csv
|
|
from utils import get_length
|
|
|
|
|
|
# Root logging setup: timestamped INFO-level records for the whole app.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# aiohttp is chatty at INFO; only surface its warnings and errors.
_aiohttp_logger = logging.getLogger("aiohttp")
_aiohttp_logger.setLevel(logging.WARNING)

# Translation table shipped next to this module (passed to ``Translate``).
trans_file = os.path.join(
    os.path.dirname(__file__),
    "i18n",
    "translations.json",
)

# Translation keys for the page header and the terms footer.
header = """header"""
terms = "terms"
|
|
|
|
|
|
def update_all_characters(lang, category):
    """Reload the character table for ``lang`` and rebuild dependent widgets.

    Args:
        lang: Language code forwarded to ``load_characters_csv``.
        category: Previously stored category list. It is not read; the
            freshly loaded list replaces it.

    Returns:
        A 5-tuple ``(all_characters, visible_characters, gallery, categories,
        kind_update)``. The gallery and the category dropdown update both
        point at the first loaded category.
    """
    all_rows, categories = load_characters_csv(lang)
    first_kind = categories[0]
    visible = get_characters(kind=first_kind, all_characters=all_rows)

    gallery_items = [[entry["头像"], entry["名称"]] for entry in visible]
    gallery = gr.Gallery(
        value=gallery_items,
        show_label=False,
        elem_id="character_gallery",
        columns=[11],
        object_fit="contain",
        height="auto",
        interactive=False,
        allow_preview=False,
        selected_index=None,
    )
    kind_update = gr.update(choices=categories, value=first_kind)

    return all_rows, visible, gallery, categories, kind_update
|
|
|
|
|
|
def _decode_avatar(avatar):
    """Return a displayable avatar, unpickling byte-like values when possible.

    Strings are returned untouched. Any other value is tried as a pickled
    payload; if decoding fails the value is returned unchanged.
    """
    if isinstance(avatar, str):
        return avatar

    import pickle

    try:
        # SECURITY(review): pickle.loads can execute arbitrary code. This is
        # only acceptable while the character table comes from a trusted
        # source; do not feed user-uploaded data through this path.
        return pickle.loads(bytes(avatar))
    except Exception:
        # Best effort: not a pickle payload, hand the raw value back.
        return avatar


def get_characters(
    query=None, page=1, per_page=400, kind="原神", lang="zh", all_characters=None
):
    """Return one page of unique characters of a category as record dicts.

    Args:
        query: Optional case-insensitive substring to look for in the
            character name. It is matched literally, never as a regular
            expression, so user input such as ``(`` cannot raise.
        page: 1-based page number.
        per_page: Number of characters per page.
        kind: Value of the ``类别`` column to keep.
        lang: When ``"zh"`` and the plain name search finds nothing, the
            search is retried against the pinyin form of each name.
        all_characters: DataFrame with at least the columns ``类别``,
            ``名称``, ``id`` and ``头像``. ``None`` yields an empty list.

    Returns:
        A list of dicts (one per distinct ``名称``, ordered by ``id``) for
        the requested page, with ``头像`` decoded via ``_decode_avatar``.
    """
    if all_characters is None:
        return []

    kind_rows = all_characters[all_characters["类别"] == kind]
    matches = kind_rows

    if query:
        by_name = kind_rows["名称"].str.contains(
            query, case=False, regex=False, na=False
        )
        matches = kind_rows[by_name]

        if matches.empty and lang == "zh":
            # Fall back to matching the romanized (pinyin) spelling.
            romanized = kind_rows["名称"].apply(lambda n: "".join(lazy_pinyin(n)))
            by_pinyin = romanized.str.contains(
                query, case=False, regex=False, na=False
            )
            matches = kind_rows[by_pinyin]

    # Keep a single row per character name, ordered by id.
    unique_characters = (
        matches.groupby("名称").first().reset_index().sort_values(by="id")
    )
    unique_characters["头像"] = unique_characters["头像"].apply(_decode_avatar)

    start_index = (page - 1) * per_page
    end_index = start_index + per_page
    return unique_characters.iloc[start_index:end_index].to_dict("records")
|
|
|
|
|
|
# Localized user-facing strings for ``generate``; English is the fallback
# for language codes that are not listed.
_GENERATE_TEXT = {
    "no_character": {
        "zh": "请先选择一个角色",
        "en": "Please select a character first",
        "ja": "まず、キャラクターを選択してください",
        "ko": "먼저 캐릭터를 선택하세요",
    },
    "no_voice_id": {
        "zh": "所选角色没有关联的 voice_id",
        "en": "The selected characters have no associated voice_id",
        "ja": "選択したキャラクターに関連付けられた voice_id がありません",
        "ko": "선택한 캐릭터에 연결된 voice_id가 없습니다",
    },
    "voice_missing": {
        "zh": "该角色暂未创建语音",
        "en": "The character has not been created yet",
        "ja": "そのキャラクターの音声はまだ作成されていません",
        "ko": "해당 캐릭터의 음성이 아직 생성되지 않았습니다",
    },
    "empty_text": {
        "zh": "请输入需要合成的文本",
        "en": "Please enter the text to be synthesized",
        "ja": "合成するテキストを入力してください",
        "ko": "합성할 텍스트를 입력하세요",
    },
    "too_long": {
        "zh": "长度请控制在1024个字符以内",
        "en": "The text length exceeds 1024 words",
        "ja": "テキストの長さが1024文字を超えています",
        "ko": "텍스트 길이가 1024자를 초과합니다",
    },
    "elapsed": {
        "zh": "合成共花费{:.2f}秒",
        "en": "Total time spent synthesizing: {:.2f} seconds",
        "ja": "合成にかかった時間: {:.2f}秒",
        "ko": "합성에 소요된 시간: {:.2f}초",
    },
}


async def generate(selected_character=None, selected_characters=None, text="", lang="zh"):
    """Synthesize ``text`` with the voices of the chosen characters.

    Args:
        selected_character: Character dict currently highlighted in the UI,
            or ``None``. When set it is placed ahead of the list below.
        selected_characters: List of character dicts already added by the
            user. ``None`` is treated as an empty list.
        text: Text to synthesize.
        lang: UI language code used to pick messages; unknown codes fall
            back to English.

    Returns:
        ``(audio, cost_time)`` where ``audio`` is whatever ``generate_api``
        returned and ``cost_time`` is a localized elapsed-time string.

    Raises:
        gr.Error: If no character is selected, no selected character has a
            ``voice_id``, a selected voice id is the placeholder ``"1"``,
            the text is empty or longer than 1024 (per ``get_length``), or
            ``generate_api`` returns an error string.
    """

    def message(key):
        table = _GENERATE_TEXT[key]
        return table.get(lang, table["en"])

    characters = list(selected_characters or [])
    if selected_character:
        characters.insert(0, selected_character)
    if not characters:
        raise gr.Error(message("no_character"))

    voice_ids = [char.get("voice_id") for char in characters if char.get("voice_id")]
    if not voice_ids:
        raise gr.Error(message("no_voice_id"))

    # The original compared the whole list to "1", which can never be true;
    # the check is meant to reject placeholder voice ids.
    if "1" in voice_ids:
        raise gr.Error(message("voice_missing"))

    if text == "":
        raise gr.Error(message("empty_text"))
    if get_length(text) > 1024:
        raise gr.Error(message("too_long"))

    started = time.perf_counter()
    audio = await generate_api(voice_ids, text)
    elapsed = time.perf_counter() - started

    # generate_api reports failures as a plain string.
    if isinstance(audio, str):
        logging.error("Synthesis failed: %s", audio)
        raise gr.Error(audio)

    return audio, message("elapsed").format(elapsed)
|
|
|
|
|
|
def get_character_emotions(character, all_characters):
    """List one record per distinct emotion available for ``character``.

    Args:
        character: Dict with at least the key ``名称``.
        all_characters: DataFrame with the columns ``名称`` and ``情绪``.

    Returns:
        A list of row dicts, the first row for each ``情绪`` value of the
        character. If the character has no rows, a single placeholder
        ``{"名称": <name>, "情绪": "默认情绪"}`` is returned instead.
    """
    same_name = all_characters["名称"] == character["名称"]
    per_emotion = all_characters[same_name].drop_duplicates(subset=["情绪"])
    records = per_emotion.to_dict("records")

    if not records:
        return [{"名称": character["名称"], "情绪": "默认情绪"}]
    return records
|
|
|
|
|
|
def update_character_info(character_name, emotion, current_character, all_characters):
    """Resolve the record for ``character_name`` with the chosen ``emotion``.

    The function always returns two values because the event wiring expects
    two outputs (current character, character table).

    Args:
        character_name: Name shown in the "currently selected" box.
        emotion: Emotion picked in the dropdown; may be empty while the
            dropdown is being reset.
        current_character: Record currently held in state; returned
            unchanged when nothing better can be resolved.
        all_characters: DataFrame with the columns ``名称`` and ``情绪``.

    Returns:
        ``(character_info, all_characters)`` where ``character_info`` is

        * ``None`` when no character name is set (selection cleared),
        * ``current_character`` when the emotion is empty or no row matches,
        * otherwise the first matching row as a dict.
    """
    if not character_name:
        return None, all_characters
    if not emotion:
        return current_character, all_characters

    matches = all_characters[
        (all_characters["名称"] == character_name)
        & (all_characters["情绪"] == emotion)
    ]
    if matches.empty:
        # Stale dropdown value for another character: keep what we have.
        return current_character, all_characters

    return matches.iloc[0].to_dict(), all_characters
|
|
|
|
|
|
# Localized user-facing strings for ``add_new_voice``; English is the
# fallback for language codes that are not listed.
_ADD_VOICE_TEXT = {
    "no_character": {
        "zh": "请先选择一个角色",
        "en": "Please select a character first",
        "ja": "まず、キャラクターを選択してください",
        "ko": "먼저 캐릭터를 선택하세요",
    },
    "limit_reached": {
        "zh": "已达到最大选择数(5个)",
        "en": "Maximum number of selections reached (5)",
        "ja": "最大選択数(5個)に達しました",
        "ko": "최대 선택 수(5개)에 도달했습니다",
    },
}


def add_new_voice(current_character, selected_characters, kind, lang, all_characters):
    """Add the highlighted character to the selection, or refresh its emotion.

    If a character with the same ``名称`` is already selected, its ``情绪``
    and ``voice_id`` are replaced by those of ``current_character`` (the
    voice id has to follow the emotion, since synthesis reads ``voice_id``).
    Otherwise the character is inserted at the front, subject to the limit
    of five selected characters. ``selected_characters`` is modified in place.

    Args:
        current_character: Record of the highlighted character, or ``None``.
        selected_characters: List of already selected character records.
        kind: Category used to rebuild the gallery.
        lang: UI language code; unknown codes fall back to English messages.
        all_characters: Character table passed through to ``get_characters``.

    Returns:
        An 8-tuple matching the wired outputs: cleared current character,
        cleared name box, emptied emotion dropdown, the selection list, the
        refreshed character list, a rebuilt gallery, a visibility update for
        the selection container, and ``all_characters``.

    Raises:
        gr.Error: If no character is highlighted, or a new character would
            exceed the limit of five.
    """

    def message(key):
        table = _ADD_VOICE_TEXT[key]
        return table.get(lang, table["en"])

    if not current_character:
        raise gr.Error(message("no_character"))

    existing_char = next(
        (
            char
            for char in selected_characters
            if char["名称"] == current_character["名称"]
        ),
        None,
    )

    if existing_char:
        if existing_char["情绪"] != current_character["情绪"]:
            existing_char["情绪"] = current_character["情绪"]
            # Keep the voice in sync with the newly chosen emotion.
            if "voice_id" in current_character:
                existing_char["voice_id"] = current_character["voice_id"]
    else:
        # The cap only applies when the selection actually grows.
        if len(selected_characters) >= 5:
            raise gr.Error(message("limit_reached"))
        selected_characters.insert(0, current_character)

    updated_characters = get_characters(
        kind=kind, lang=lang, all_characters=all_characters
    )

    updated_gallery = gr.Gallery(
        value=[[char["头像"], char["名称"]] for char in updated_characters],
        show_label=False,
        elem_id="character_gallery",
        columns=[11],
        object_fit="contain",
        height="auto",
        interactive=False,
        allow_preview=False,
        selected_index=None,
    )

    return (
        None,
        gr.update(value=""),
        gr.update(choices=[]),
        selected_characters,
        updated_characters,
        updated_gallery,
        gr.update(visible=True),
        all_characters,
    )
|
|
|
|
|
|
def update_selected_chars_display(selected_characters):
    """Build the widget updates for the fixed rows of selected characters.

    For every row in the module-level ``selected_chars_rows`` four updates
    are produced, in order: name box, emotion box, and two visibility-only
    updates. Rows that have a selected character show its ``名称`` and
    ``情绪``; remaining rows are blanked and hidden.

    Args:
        selected_characters: List of selected character records.

    Returns:
        A flat list with four ``gr.update`` objects per row.
    """
    updates = []
    for slot, _widgets in enumerate(selected_chars_rows):
        occupied = slot < len(selected_characters)
        if occupied:
            entry = selected_characters[slot]
            name_value, emotion_value = entry["名称"], entry["情绪"]
        else:
            name_value = emotion_value = ""

        updates += [
            gr.update(value=name_value, visible=occupied),
            gr.update(value=emotion_value, visible=occupied),
            gr.update(visible=occupied),
            gr.update(visible=occupied),
        ]
    return updates
|
|
|
|
|
|
def remove_character(index, selected_characters):
    """Delete the selected character at ``index``, if that slot exists.

    Args:
        index: Row position. It is supplied by a ``gr.Number`` input, so it
            may arrive as a float or ``None``; it is coerced to ``int``
            before being used as a list index.
        selected_characters: List of selected character records; modified
            in place.

    Returns:
        ``(selected_characters, container_update)`` where the update keeps
        the selection container visible.
    """
    try:
        position = int(index)
    except (TypeError, ValueError):
        # Not a usable index: leave the selection untouched.
        position = -1

    if 0 <= position < len(selected_characters):
        del selected_characters[position]
    return selected_characters, gr.update(visible=True)
|
|
|
|
|
|
def update_gallery(kind, query, all_characters, lang="zh"):
    """Re-run the character search and return fresh gallery contents.

    The previous version forwarded the module-level ``lang`` object (the
    language ``gr.Radio`` component, not a language code) to
    ``get_characters``, so the pinyin fallback there could never trigger.
    ``lang`` is now an optional parameter; existing three-input wirings keep
    working and get the pinyin fallback by default.

    Args:
        kind: Category to display.
        query: Search text; may be empty.
        all_characters: Character table, returned unchanged.
        lang: Language code forwarded to ``get_characters``.

    Returns:
        ``(characters, gallery_items, all_characters)`` where
        ``gallery_items`` is a list of ``[avatar, name]`` pairs.
    """
    updated_characters = get_characters(
        kind=kind, query=query, lang=lang, all_characters=all_characters
    )
    gallery_items = [[char["头像"], char["名称"]] for char in updated_characters]
    return updated_characters, gallery_items, all_characters
|
|
|
|
|
|
def on_select(evt: gr.SelectData, characters, selected_characters, all_characters):
    """Handle a click on a gallery item and stage that character.

    The default emotion is the first one labelled neutral (``正常``,
    ``보통`` or ``normal``); if none is, the first available emotion is used.

    Args:
        evt: Gradio selection event; ``evt.index`` indexes ``characters``.
        characters: Character records currently shown in the gallery.
        selected_characters: Current selection list (may be empty/``None``).
        all_characters: Character table used to look up emotions.

    Returns:
        ``(name, emotion_dropdown, character_dict, selected_characters)``.
        ``character_dict`` is a copy of the clicked record with ``情绪`` and
        ``voice_id`` set to the defaults.
    """
    if not selected_characters:
        selected_characters = []

    selected = characters[evt.index]
    emotions = get_character_emotions(selected, all_characters)

    neutral_labels = {"正常", "보통", "normal"}
    fallback = emotions[0] if emotions else {}
    default_record = next(
        (item for item in emotions if item["情绪"] in neutral_labels),
        fallback,
    )
    default_emotion = default_record.get("情绪", "")
    # The placeholder record for characters without rows has no voice_id.
    default_voice_id = default_record.get("voice_id", "")

    character_dict = selected.copy()
    character_dict["情绪"] = default_emotion
    character_dict["voice_id"] = default_voice_id

    emotion_dropdown = gr.Dropdown(
        choices=[item["情绪"] for item in emotions], value=default_emotion
    )
    return selected["名称"], emotion_dropdown, character_dict, selected_characters
|
|
|
|
|
|
async def update_prompt_audio(current_character):
    """Fetch the reference audio for the staged character.

    Args:
        current_character: Character record, or a falsy value when nothing
            is staged.

    Returns:
        The result of ``get_audio`` for the record's ``voice_id``, or
        ``None`` when no character is staged.
    """
    if not current_character:
        return None
    voice_id = current_character.get("voice_id")
    return await get_audio(voice_id)
|
|
|
|
|
|
# Minimum number of seconds between two accepted creation requests.
_CREATE_VOICE_COOLDOWN = 30

# Localized user-facing strings for ``create_voice``; English is the
# fallback for language codes that are not listed.
_CREATE_VOICE_TEXT = {
    "cooldown": {
        "zh": "已提交上个创建请求,请在{:.1f}秒后提交新的角色",
        "en": "The last creation request has been submitted. Please try to create a new character after {:.1f} seconds",
        "ja": "前回の作成リクエストが送信されました。{:.1f}秒後に新しいキャラクターを作成してください",
        "ko": "이전 생성 요청이 제출되었습니다. {:.1f}초 후에 새 캐릭터를 만들어주세요",
    },
    "incomplete": {
        "zh": "请填写完整信息",
        "en": "Please fill in all the information",
        "ja": "すべての情報を入力してください",
        "ko": "모든 정보를 입력하세요",
    },
    "bad_duration": {
        "zh": "音频时长请控制在3.2-8秒之间",
        "en": "The audio duration should be between 3.2 and 8 seconds",
        "ja": "音声の長さは3.2秒から8秒の間にしてください",
        "ko": "음성 길이는 3.2초에서 8초 사이로 설정해야 합니다",
    },
    "created": {
        "zh": "创建成功,您创建的语音将在审核后上线",
        "en": "Creation successful. The voice you created will be available after review.",
        "ja": "作成が完了しました。作成された音声は審査後に公開されます。",
        "ko": "생성 완료. 귀하가 생성한 음성은 검토 후 공개될 예정입니다.",
    },
}


async def create_voice(
    avatar, name, emotion, tags, gender, audio_data, lang, since_last_update
):
    """Validate the "Create Voice" form and submit it via ``generate_voice``.

    The cooldown timestamp is only refreshed after a successful submission,
    so failed validation does not lock the user out and every accepted
    request starts a new 30 second cooldown.

    Args:
        avatar: Uploaded avatar image, or ``None``.
        name: Character name.
        emotion: Emotion label.
        tags: Free-form tags.
        gender: Selected gender value, or ``None`` if nothing is selected.
        audio_data: Prompt audio; indexed as ``audio_data[0]`` (sample rate)
            and ``audio_data[1]`` (samples) to compute the duration.
        lang: UI language code; unknown codes fall back to English messages.
        since_last_update: ``time.time()`` of the last accepted request, or
            ``None`` if there has been none.

    Returns:
        A 7-tuple for the six form fields plus the cooldown timestamp. On
        rejection the form is left as is (empty required fields are reset);
        on success all six fields are cleared.
    """

    def message(key):
        table = _CREATE_VOICE_TEXT[key]
        return table.get(lang, table["en"])

    unchanged = (avatar, name, emotion, tags, gender, audio_data, since_last_update)

    # Cooldown: reject requests that follow an accepted one too closely.
    if since_last_update is not None:
        remaining = _CREATE_VOICE_COOLDOWN - (time.time() - since_last_update)
        if remaining > 0:
            gr.Warning(message("cooldown").format(remaining))
            return unchanged

    form = (
        ("avatar", avatar),
        ("name", name),
        ("emotion", emotion),
        ("tags", tags),
        ("gender", gender),
        ("audio_data", audio_data),
    )
    updates = {}
    for field, value in form:
        if field in ("avatar", "audio_data"):
            if value is None or (isinstance(value, np.ndarray) and value.size == 0):
                updates[field] = gr.update(value=None)
        elif value is None:
            # e.g. a dropdown with nothing selected
            updates[field] = gr.update(value=None)
        elif isinstance(value, str) and not value.strip():
            updates[field] = gr.update(value="")

    if updates:
        gr.Warning(message("incomplete"))
        return tuple(
            [updates.get(field, gr.update()) for field, _ in form]
            + [since_last_update]
        )

    duration = len(audio_data[1]) / audio_data[0]
    if duration < 3.2 or duration > 8:
        gr.Warning(message("bad_duration"))
        return unchanged

    await generate_voice(avatar, name, emotion, tags, gender, audio_data, lang)
    gr.Info(message("created"), duration=20)

    # Start the cooldown from the moment the request was accepted.
    return tuple([gr.update(value=None) for _ in range(6)] + [time.time()])
|
|
|
|
|
|
# Extra tags injected into the page <head> (title and SEO metadata).
head = """
<title>Free Online Text to Speech (TTS) | Convert Text to Audio</title>
<meta name="description" content="Text to Speech(TTS) for free! 5-second voice cloning, no sign-up required.">
<meta name="keywords" content="text to speech, TTS, free TTS, online TTS, speech synthesis, voice generator">
"""

# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order; confirm it against the intended page layout.
with gr.Blocks(title="Online Free TTS", theme=gr.themes.Soft(), head=head) as demo:
    gr.Markdown(
        "Online Free TTS(Text-to-Speech). Ultra-low latency, 5-second voice cloning."
    )
    lang = gr.Radio(
        choices=[("中文", "zh"), ("English", "en"), ("日本語", "ja"), ("한국인", "ko")],
        label=gettext("Language"),
        value="en",
        scale=1,
    )

    # Load the default-language table once and share it between both states.
    _initial_table, _initial_categories = load_characters_csv("en")
    all_characters_state = gr.State(_initial_table)
    category = gr.State(_initial_categories)

    with Translate(trans_file, lang, placeholder_langs=["en", "zh", "ja", "ko"]):
        gr.Markdown(value=gettext(header))
        with gr.Group():
            # Start on the same category the dropdown below shows, matching
            # what update_all_characters produces on page load.
            initial_characters = get_characters(
                kind=category.value[0],
                lang="zh",
                all_characters=all_characters_state.value,
            )
            characters = gr.State(initial_characters)
            selected_characters = gr.State([])
            current_character = gr.State(None)

            with gr.Tab(gettext("Synthesis Voice")):
                with gr.Blocks():
                    with gr.Row():
                        kind = gr.Dropdown(
                            choices=category.value,
                            value=category.value[0],
                            label=gettext("Select character category"),
                        )
                        query = gr.Textbox(
                            label=gettext("Search character"),
                            value="",
                            lines=1,
                            max_lines=1,
                            interactive=True,
                        )
                with gr.Blocks():
                    gallery = gr.Gallery(
                        value=[
                            [char["头像"], char["名称"]] for char in characters.value
                        ],
                        show_label=False,
                        elem_id="character_gallery",
                        columns=[11],
                        object_fit="contain",
                        height="auto",
                        interactive=False,
                        allow_preview=False,
                        selected_index=None,
                    )
                with gr.Row():
                    character_name = gr.Textbox(
                        label=gettext("Currently selected character"),
                        interactive=False,
                        max_lines=1,
                    )
                    info_type = gr.Dropdown(choices=[], label=gettext("Select emotion"))
                with gr.Row():
                    add_voice_button = gr.Button(
                        gettext("Add new voice"), variant="primary"
                    )

                selected_chars_container = gr.Column(
                    elem_id="selected_chars_container", visible=False
                )

                with selected_chars_container:
                    gr.Markdown(gettext("### Selected characters"))
                    # One fixed row per selectable slot (maximum of five).
                    selected_chars_rows = []
                    for i in range(5):
                        with gr.Row() as row:
                            name = gr.Textbox(
                                label=gettext("Name"), interactive=False, max_lines=1
                            )
                            emotion = gr.Textbox(
                                label=gettext("Emotion"), interactive=False, max_lines=1
                            )
                            delete_btn = gr.Button(gettext("Delete"), scale=0)
                        selected_chars_rows.append((name, emotion, delete_btn, row))

                with gr.Row():
                    with gr.Column():
                        text = gr.Textbox(
                            label=gettext("Text to synthesize"),
                            value="",
                            lines=10,
                            max_lines=10,
                        )
                        inference_button = gr.Button(
                            gettext("🎉 Synthesize Voice 🎉"), variant="primary", size="lg"
                        )
                    with gr.Column():
                        prompt_audio = gr.Audio(
                            label=gettext("Reference audio for synthesis"),
                            interactive=False,
                            type="numpy",
                        )
                        output = gr.Audio(
                            label=gettext("Output audio"), interactive=False, type="numpy"
                        )
                        cost_time = gr.Textbox(
                            label=gettext("Synthesis time"),
                            interactive=False,
                            show_label=False,
                            max_lines=1,
                        )

                # Registering a listener does not run the handler, so there is
                # nothing to catch here; errors raised by ``generate`` at
                # request time are reported to the user by Gradio itself.
                inference_button.click(
                    fn=generate,
                    inputs=[current_character, selected_characters, text, lang],
                    outputs=[output, cost_time],
                )

            with gr.Tab(gettext("Create Voice")):
                since_last_update = gr.State(None)
                gr.Markdown(gettext("Note"))
                with gr.Row():
                    avatar = gr.Image(
                        label=gettext("Avatar"),
                        interactive=True,
                        type="pil",
                        image_mode="RGBA",
                    )
                    with gr.Column():
                        with gr.Row():
                            # These rebind ``name``/``emotion``; the selection
                            # rows above are kept in ``selected_chars_rows``.
                            name = gr.Textbox(
                                label=gettext("Name"), interactive=True, max_lines=1
                            )
                            emotion = gr.Textbox(
                                label=gettext("Emotion\n(Happy, Sad, Angry)"),
                                interactive=True,
                                max_lines=1,
                            )
                        tags = gr.Textbox(
                            label=gettext("Tags\n(Genshin, Cute, Girl, Boy, etc.)"),
                            interactive=True,
                            max_lines=1,
                        )
                        gender = gr.Dropdown(
                            label=gettext("Gender"),
                            choices=[
                                (gettext("Male"), "male"),
                                (gettext("Female"), "female"),
                                (gettext("Non-Binary"), "non-binary"),
                            ],
                            interactive=True,
                        )
                audio_data = gr.Audio(
                    label=gettext("Prompt Audio(min 3.2s, max 8s)"),
                    interactive=True,
                )
                create_button = gr.Button(
                    gettext("Create Voice"), variant="primary"
                )

        gr.Markdown(gettext(terms))

        # ---- event wiring ----

        # Flat list of every widget in the selection rows, in row order; this
        # is the output order produced by update_selected_chars_display.
        selected_row_widgets = [item for row in selected_chars_rows for item in row]

        # Reload characters when the language changes and on first page load.
        lang.change(
            fn=update_all_characters,
            inputs=[lang, category],
            outputs=[all_characters_state, characters, gallery, category, kind],
        )

        demo.load(
            update_all_characters,
            inputs=[lang, category],
            outputs=[all_characters_state, characters, gallery, category, kind],
        )

        add_voice_button.click(
            fn=add_new_voice,
            inputs=[
                current_character,
                selected_characters,
                kind,
                lang,
                all_characters_state,
            ],
            outputs=[
                current_character,
                character_name,
                info_type,
                selected_characters,
                characters,
                gallery,
                selected_chars_container,
                all_characters_state,
            ],
        ).then(
            fn=update_selected_chars_display,
            inputs=[selected_characters],
            outputs=selected_row_widgets,
        )

        gallery.select(
            fn=on_select,
            inputs=[characters, selected_characters, all_characters_state],
            outputs=[character_name, info_type, current_character, selected_characters],
        ).then(
            fn=update_prompt_audio, inputs=[current_character], outputs=[prompt_audio]
        )

        info_type.change(
            fn=update_character_info,
            inputs=[character_name, info_type, current_character, all_characters_state],
            outputs=[current_character, all_characters_state],
        ).then(
            fn=update_prompt_audio, inputs=[current_character], outputs=[prompt_audio]
        )

        # Each delete button passes its own row index through a hidden Number.
        for i, (_, _, delete_btn, _) in enumerate(selected_chars_rows):
            delete_btn.click(
                fn=remove_character,
                inputs=[gr.Number(value=i, visible=False), selected_characters],
                outputs=[selected_characters, selected_chars_container],
            ).then(
                fn=update_selected_chars_display,
                inputs=[selected_characters],
                outputs=selected_row_widgets,
            )

        kind.change(
            fn=update_gallery,
            inputs=[kind, query, all_characters_state],
            outputs=[characters, gallery, all_characters_state],
        )

        query.change(
            fn=update_gallery,
            inputs=[kind, query, all_characters_state],
            outputs=[characters, gallery, all_characters_state],
        )

        create_button.click(
            fn=create_voice,
            inputs=[
                avatar,
                name,
                emotion,
                tags,
                gender,
                audio_data,
                lang,
                since_last_update,
            ],
            outputs=[
                avatar,
                name,
                emotion,
                tags,
                gender,
                audio_data,
                since_last_update,
            ],
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # Turn on the request queue without a default concurrency cap, then
    # serve the app with the auto-generated API page disabled.
    queued_demo = demo.queue(default_concurrency_limit=None)
    queued_demo.launch(show_api=False)
|
|
|