hex-rvc / app.py
Hev832's picture
Update app.py
3b67f5f verified
raw
history blame
23.4 kB
import os
import re
import random
from scipy.io.wavfile import write
from scipy.io.wavfile import read
import numpy as np
import gradio as gr
import yt_dlp
import subprocess
from pydub import AudioSegment
from audio_separator.separator import Separator
from lib.infer import infer_audio
import edge_tts
import tempfile
import anyio
# Maps the human-readable voice label shown in the TTS dropdown to the
# edge-tts voice identifier passed to ``edge_tts.Communicate``.
# Labels are used as dropdown choices, so they are kept exactly as-is.
language_dict = {
    'English-Jenny (Female)': 'en-US-JennyNeural',
    'English-Guy (Male)': 'en-US-GuyNeural',
    'English-Ana (Female)': 'en-US-AnaNeural',
    'English-Aria (Female)': 'en-US-AriaNeural',
    'English-Christopher (Male)': 'en-US-ChristopherNeural',
    'English-Eric (Male)': 'en-US-EricNeural',
    'English-Michelle (Female)': 'en-US-MichelleNeural',
    'English-Roger (Male)': 'en-US-RogerNeural',
    'Spanish (Mexican)-Dalia (Female)': 'es-MX-DaliaNeural',
    'Spanish (Mexican)-Jorge- (Male)': 'es-MX-JorgeNeural',
    'Korean-Sun-Hi- (Female)': 'ko-KR-SunHiNeural',
    'Korean-InJoon- (Male)': 'ko-KR-InJoonNeural',
    'Thai-Premwadee- (Female)': 'th-TH-PremwadeeNeural',
    'Thai-Niwat- (Male)': 'th-TH-NiwatNeural',
    'Vietnamese-HoaiMy- (Female)': 'vi-VN-HoaiMyNeural',
    'Vietnamese-NamMinh- (Male)': 'vi-VN-NamMinhNeural',
    'Japanese-Nanami- (Female)': 'ja-JP-NanamiNeural',
    'Japanese-Keita- (Male)': 'ja-JP-KeitaNeural',
    'French-Denise- (Female)': 'fr-FR-DeniseNeural',
    'French-Eloise- (Female)': 'fr-FR-EloiseNeural',
    'French-Henri- (Male)': 'fr-FR-HenriNeural',
    'Brazilian-Francisca- (Female)': 'pt-BR-FranciscaNeural',
    'Brazilian-Antonio- (Male)': 'pt-BR-AntonioNeural',
    'Indonesian-Ardi- (Male)': 'id-ID-ArdiNeural',
    'Indonesian-Gadis- (Female)': 'id-ID-GadisNeural',
    'Hebrew-Avri- (Male)': 'he-IL-AvriNeural',
    'Hebrew-Hila- (Female)': 'he-IL-HilaNeural',
    'Italian-Isabella- (Female)': 'it-IT-IsabellaNeural',
    'Italian-Diego- (Male)': 'it-IT-DiegoNeural',
    'Italian-Elsa- (Female)': 'it-IT-ElsaNeural',
    'Dutch-Colette- (Female)': 'nl-NL-ColetteNeural',
    'Dutch-Fenna- (Female)': 'nl-NL-FennaNeural',
    'Dutch-Maarten- (Male)': 'nl-NL-MaartenNeural',
    'Malese-Osman- (Male)': 'ms-MY-OsmanNeural',
    'Malese-Yasmin- (Female)': 'ms-MY-YasminNeural',
    'Norwegian-Pernille- (Female)': 'nb-NO-PernilleNeural',
    'Norwegian-Finn- (Male)': 'nb-NO-FinnNeural',
    'Swedish-Sofie- (Female)': 'sv-SE-SofieNeural',
    # NOTE(review): the label below reads "ArabicSwedish" although the voice is
    # Swedish (sv-SE); the key is left unchanged because it is a dropdown value.
    'ArabicSwedish-Mattias- (Male)': 'sv-SE-MattiasNeural',
    'Arabic-Hamed- (Male)': 'ar-SA-HamedNeural',
    'Arabic-Zariyah- (Female)': 'ar-SA-ZariyahNeural',
    'Greek-Athina- (Female)': 'el-GR-AthinaNeural',
    'Greek-Nestoras- (Male)': 'el-GR-NestorasNeural',
    'German-Katja- (Female)': 'de-DE-KatjaNeural',
    'German-Amala- (Female)': 'de-DE-AmalaNeural',
    'German-Conrad- (Male)': 'de-DE-ConradNeural',
    'German-Killian- (Male)': 'de-DE-KillianNeural',
    'Afrikaans-Adri- (Female)': 'af-ZA-AdriNeural',
    'Afrikaans-Willem- (Male)': 'af-ZA-WillemNeural',
    'Ethiopian-Ameha- (Male)': 'am-ET-AmehaNeural',
    'Ethiopian-Mekdes- (Female)': 'am-ET-MekdesNeural',
    'Arabic (UAD)-Fatima- (Female)': 'ar-AE-FatimaNeural',
    'Arabic (UAD)-Hamdan- (Male)': 'ar-AE-HamdanNeural',
    'Arabic (Bahrain)-Ali- (Male)': 'ar-BH-AliNeural',
    'Arabic (Bahrain)-Laila- (Female)': 'ar-BH-LailaNeural',
    'Arabic (Algeria)-Ismael- (Male)': 'ar-DZ-IsmaelNeural',
    'Arabic (Egypt)-Salma- (Female)': 'ar-EG-SalmaNeural',
    'Arabic (Egypt)-Shakir- (Male)': 'ar-EG-ShakirNeural',
    'Arabic (Iraq)-Bassel- (Male)': 'ar-IQ-BasselNeural',
    'Arabic (Iraq)-Rana- (Female)': 'ar-IQ-RanaNeural',
    'Arabic (Jordan)-Sana- (Female)': 'ar-JO-SanaNeural',
    'Arabic (Jordan)-Taim- (Male)': 'ar-JO-TaimNeural',
    'Arabic (Kuwait)-Fahed- (Male)': 'ar-KW-FahedNeural',
    'Arabic (Kuwait)-Noura- (Female)': 'ar-KW-NouraNeural',
    'Arabic (Lebanon)-Layla- (Female)': 'ar-LB-LaylaNeural',
    'Arabic (Lebanon)-Rami- (Male)': 'ar-LB-RamiNeural',
    'Arabic (Libya)-Iman- (Female)': 'ar-LY-ImanNeural',
    'Arabic (Libya)-Omar- (Male)': 'ar-LY-OmarNeural',
    'Arabic (Morocco)-Jamal- (Male)': 'ar-MA-JamalNeural',
    'Arabic (Morocco)-Mouna- (Female)': 'ar-MA-MounaNeural',
    'Arabic (Oman)-Abdullah- (Male)': 'ar-OM-AbdullahNeural',
    'Arabic (Oman)-Aysha- (Female)': 'ar-OM-AyshaNeural',
    'Arabic (Qatar)-Amal- (Female)': 'ar-QA-AmalNeural',
    'Arabic (Qatar)-Moaz- (Male)': 'ar-QA-MoazNeural',
    'Arabic (Syrian Arab Republic)-Amany- (Female)': 'ar-SY-AmanyNeural',
    'Arabic (Syrian Arab Republic)-Laith- (Male)': 'ar-SY-LaithNeural',
    'Arabic (Tunisia)-Hedi- (Male)': 'ar-TN-HediNeural',
    'Arabic (Tunisia)-Reem- (Female)': 'ar-TN-ReemNeural',
    'Arabic (Yemen )-Maryam- (Female)': 'ar-YE-MaryamNeural',
    'Arabic (Yemen )-Saleh- (Male)': 'ar-YE-SalehNeural',
    'Azerbaijani-Babek- (Male)': 'az-AZ-BabekNeural',
    'Azerbaijani-Banu- (Female)': 'az-AZ-BanuNeural',
    'Bulgarian-Borislav- (Male)': 'bg-BG-BorislavNeural',
    'Bulgarian-Kalina- (Female)': 'bg-BG-KalinaNeural',
    'Bengali (Bangladesh)-Nabanita- (Female)': 'bn-BD-NabanitaNeural',
    'Bengali (Bangladesh)-Pradeep- (Male)': 'bn-BD-PradeepNeural',
    'Bengali (India)-Bashkar- (Male)': 'bn-IN-BashkarNeural',
    'Bengali (India)-Tanishaa- (Female)': 'bn-IN-TanishaaNeural',
    'Bosniak (Bosnia and Herzegovina)-Goran- (Male)': 'bs-BA-GoranNeural',
    'Bosniak (Bosnia and Herzegovina)-Vesna- (Female)': 'bs-BA-VesnaNeural',
    'Catalan (Spain)-Joana- (Female)': 'ca-ES-JoanaNeural',
    'Catalan (Spain)-Enric- (Male)': 'ca-ES-EnricNeural',
    'Czech (Czech Republic)-Antonin- (Male)': 'cs-CZ-AntoninNeural',
    'Czech (Czech Republic)-Vlasta- (Female)': 'cs-CZ-VlastaNeural',
    'Welsh (UK)-Aled- (Male)': 'cy-GB-AledNeural',
    'Welsh (UK)-Nia- (Female)': 'cy-GB-NiaNeural',
    'Danish (Denmark)-Christel- (Female)': 'da-DK-ChristelNeural',
    'Danish (Denmark)-Jeppe- (Male)': 'da-DK-JeppeNeural',
    'German (Austria)-Ingrid- (Female)': 'de-AT-IngridNeural',
    'German (Austria)-Jonas- (Male)': 'de-AT-JonasNeural',
    'German (Switzerland)-Jan- (Male)': 'de-CH-JanNeural',
    'German (Switzerland)-Leni- (Female)': 'de-CH-LeniNeural',
    'English (Australia)-Natasha- (Female)': 'en-AU-NatashaNeural',
    'English (Australia)-William- (Male)': 'en-AU-WilliamNeural',
    'English (Canada)-Clara- (Female)': 'en-CA-ClaraNeural',
    'English (Canada)-Liam- (Male)': 'en-CA-LiamNeural',
    'English (UK)-Libby- (Female)': 'en-GB-LibbyNeural',
    'English (UK)-Maisie- (Female)': 'en-GB-MaisieNeural',
    'English (UK)-Ryan- (Male)': 'en-GB-RyanNeural',
    'English (UK)-Sonia- (Female)': 'en-GB-SoniaNeural',
    'English (UK)-Thomas- (Male)': 'en-GB-ThomasNeural',
    'English (Hong Kong)-Sam- (Male)': 'en-HK-SamNeural',
    'English (Hong Kong)-Yan- (Female)': 'en-HK-YanNeural',
    'English (Ireland)-Connor- (Male)': 'en-IE-ConnorNeural',
    'English (Ireland)-Emily- (Female)': 'en-IE-EmilyNeural',
    'English (India)-Neerja- (Female)': 'en-IN-NeerjaNeural',
    'English (India)-Prabhat- (Male)': 'en-IN-PrabhatNeural',
    'English (Kenya)-Asilia- (Female)': 'en-KE-AsiliaNeural',
    'English (Kenya)-Chilemba- (Male)': 'en-KE-ChilembaNeural',
    'English (Nigeria)-Abeo- (Male)': 'en-NG-AbeoNeural',
    'English (Nigeria)-Ezinne- (Female)': 'en-NG-EzinneNeural',
    'English (New Zealand)-Mitchell- (Male)': 'en-NZ-MitchellNeural',
    'English (Philippines)-James- (Male)': 'en-PH-JamesNeural',
    'English (Philippines)-Rosa- (Female)': 'en-PH-RosaNeural',
    'English (Singapore)-Luna- (Female)': 'en-SG-LunaNeural',
    'English (Singapore)-Wayne- (Male)': 'en-SG-WayneNeural',
    'English (Tanzania)-Elimu- (Male)': 'en-TZ-ElimuNeural',
    'English (Tanzania)-Imani- (Female)': 'en-TZ-ImaniNeural',
    'English (South Africa)-Leah- (Female)': 'en-ZA-LeahNeural',
    'English (South Africa)-Luke- (Male)': 'en-ZA-LukeNeural',
    'Spanish (Argentina)-Elena- (Female)': 'es-AR-ElenaNeural',
    'Spanish (Argentina)-Tomas- (Male)': 'es-AR-TomasNeural',
    'Spanish (Bolivia)-Marcelo- (Male)': 'es-BO-MarceloNeural',
    'Spanish (Bolivia)-Sofia- (Female)': 'es-BO-SofiaNeural',
    'Spanish (Colombia)-Gonzalo- (Male)': 'es-CO-GonzaloNeural',
    'Spanish (Colombia)-Salome- (Female)': 'es-CO-SalomeNeural',
    'Spanish (Costa Rica)-Juan- (Male)': 'es-CR-JuanNeural',
    'Spanish (Costa Rica)-Maria- (Female)': 'es-CR-MariaNeural',
    'Spanish (Cuba)-Belkys- (Female)': 'es-CU-BelkysNeural',
    'Spanish (Dominican Republic)-Emilio- (Male)': 'es-DO-EmilioNeural',
    'Spanish (Dominican Republic)-Ramona- (Female)': 'es-DO-RamonaNeural',
    'Spanish (Ecuador)-Andrea- (Female)': 'es-EC-AndreaNeural',
    'Spanish (Ecuador)-Luis- (Male)': 'es-EC-LuisNeural',
    'Spanish (Spain)-Alvaro- (Male)': 'es-ES-AlvaroNeural',
    'Spanish (Spain)-Elvira- (Female)': 'es-ES-ElviraNeural',
    'Spanish (Equatorial Guinea)-Teresa- (Female)': 'es-GQ-TeresaNeural',
    'Spanish (Guatemala)-Andres- (Male)': 'es-GT-AndresNeural',
    'Spanish (Guatemala)-Marta- (Female)': 'es-GT-MartaNeural',
    'Spanish (Honduras)-Carlos- (Male)': 'es-HN-CarlosNeural',
    'Spanish (Honduras)-Karla- (Female)': 'es-HN-KarlaNeural',
    'Spanish (Nicaragua)-Federico- (Male)': 'es-NI-FedericoNeural',
    'Spanish (Nicaragua)-Yolanda- (Female)': 'es-NI-YolandaNeural',
    'Spanish (Panama)-Margarita- (Female)': 'es-PA-MargaritaNeural',
    'Spanish (Panama)-Roberto- (Male)': 'es-PA-RobertoNeural',
    'Spanish (Peru)-Alex- (Male)': 'es-PE-AlexNeural',
    'Spanish (Peru)-Camila- (Female)': 'es-PE-CamilaNeural',
    'Spanish (Puerto Rico)-Karina- (Female)': 'es-PR-KarinaNeural',
    'Spanish (Puerto Rico)-Victor- (Male)': 'es-PR-VictorNeural',
    'Spanish (Paraguay)-Mario- (Male)': 'es-PY-MarioNeural',
    'Spanish (Paraguay)-Tania- (Female)': 'es-PY-TaniaNeural',
    'Spanish (El Salvador)-Lorena- (Female)': 'es-SV-LorenaNeural',
    'Spanish (El Salvador)-Rodrigo- (Male)': 'es-SV-RodrigoNeural',
    'Spanish (United States)-Alonso- (Male)': 'es-US-AlonsoNeural',
    'Spanish (United States)-Paloma- (Female)': 'es-US-PalomaNeural',
    'Spanish (Uruguay)-Mateo- (Male)': 'es-UY-MateoNeural',
    'Spanish (Uruguay)-Valentina- (Female)': 'es-UY-ValentinaNeural',
    'Spanish (Venezuela)-Paola- (Female)': 'es-VE-PaolaNeural',
    'Spanish (Venezuela)-Sebastian- (Male)': 'es-VE-SebastianNeural',
    'Estonian (Estonia)-Anu- (Female)': 'et-EE-AnuNeural',
    'Estonian (Estonia)-Kert- (Male)': 'et-EE-KertNeural',
    'Persian (Iran)-Dilara- (Female)': 'fa-IR-DilaraNeural',
    'Persian (Iran)-Farid- (Male)': 'fa-IR-FaridNeural',
    'Finnish (Finland)-Harri- (Male)': 'fi-FI-HarriNeural',
    'Finnish (Finland)-Noora- (Female)': 'fi-FI-NooraNeural',
    'French (Belgium)-Charline- (Female)': 'fr-BE-CharlineNeural',
    'French (Belgium)-Gerard- (Male)': 'fr-BE-GerardNeural',
    'French (Canada)-Sylvie- (Female)': 'fr-CA-SylvieNeural',
    'French (Canada)-Antoine- (Male)': 'fr-CA-AntoineNeural',
    'French (Canada)-Jean- (Male)': 'fr-CA-JeanNeural',
    'French (Switzerland)-Ariane- (Female)': 'fr-CH-ArianeNeural',
    'French (Switzerland)-Fabrice- (Male)': 'fr-CH-FabriceNeural',
    'Irish (Ireland)-Colm- (Male)': 'ga-IE-ColmNeural',
    'Irish (Ireland)-Orla- (Female)': 'ga-IE-OrlaNeural',
    'Galician (Spain)-Roi- (Male)': 'gl-ES-RoiNeural',
    'Galician (Spain)-Sabela- (Female)': 'gl-ES-SabelaNeural',
    'Gujarati (India)-Dhwani- (Female)': 'gu-IN-DhwaniNeural',
    'Gujarati (India)-Niranjan- (Male)': 'gu-IN-NiranjanNeural',
    'Hindi (India)-Madhur- (Male)': 'hi-IN-MadhurNeural',
    'Hindi (India)-Swara- (Female)': 'hi-IN-SwaraNeural',
    'Croatian (Croatia)-Gabrijela- (Female)': 'hr-HR-GabrijelaNeural',
    'Croatian (Croatia)-Srecko- (Male)': 'hr-HR-SreckoNeural',
    'Hungarian (Hungary)-Noemi- (Female)': 'hu-HU-NoemiNeural',
    'Hungarian (Hungary)-Tamas- (Male)': 'hu-HU-TamasNeural',
    'Icelandic (Iceland)-Gudrun- (Female)': 'is-IS-GudrunNeural',
    'Icelandic (Iceland)-Gunnar- (Male)': 'is-IS-GunnarNeural',
    'Javanese (Indonesia)-Dimas- (Male)': 'jv-ID-DimasNeural',
    'Javanese (Indonesia)-Siti- (Female)': 'jv-ID-SitiNeural',
    'Georgian (Georgia)-Eka- (Female)': 'ka-GE-EkaNeural',
    'Georgian (Georgia)-Giorgi- (Male)': 'ka-GE-GiorgiNeural',
    'Kazakh (Kazakhstan)-Aigul- (Female)': 'kk-KZ-AigulNeural',
    'Kazakh (Kazakhstan)-Daulet- (Male)': 'kk-KZ-DauletNeural',
    'Khmer (Cambodia)-Piseth- (Male)': 'km-KH-PisethNeural',
    'Khmer (Cambodia)-Sreymom- (Female)': 'km-KH-SreymomNeural',
    'Kannada (India)-Gagan- (Male)': 'kn-IN-GaganNeural',
    'Kannada (India)-Sapna- (Female)': 'kn-IN-SapnaNeural',
    'Lao (Laos)-Chanthavong- (Male)': 'lo-LA-ChanthavongNeural',
    'Lao (Laos)-Keomany- (Female)': 'lo-LA-KeomanyNeural',
    'Lithuanian (Lithuania)-Leonas- (Male)': 'lt-LT-LeonasNeural',
    'Lithuanian (Lithuania)-Ona- (Female)': 'lt-LT-OnaNeural',
    'Latvian (Latvia)-Everita- (Female)': 'lv-LV-EveritaNeural',
    'Latvian (Latvia)-Nils- (Male)': 'lv-LV-NilsNeural',
    'Macedonian (North Macedonia)-Aleksandar- (Male)': 'mk-MK-AleksandarNeural',
    'Macedonian (North Macedonia)-Marija- (Female)': 'mk-MK-MarijaNeural',
    'Malayalam (India)-Midhun- (Male)': 'ml-IN-MidhunNeural',
    'Malayalam (India)-Sobhana- (Female)': 'ml-IN-SobhanaNeural',
    'Mongolian (Mongolia)-Bataa- (Male)': 'mn-MN-BataaNeural',
    'Mongolian (Mongolia)-Yesui- (Female)': 'mn-MN-YesuiNeural',
    'Marathi (India)-Aarohi- (Female)': 'mr-IN-AarohiNeural',
    'Marathi (India)-Manohar- (Male)': 'mr-IN-ManoharNeural',
    'Maltese (Malta)-Grace- (Female)': 'mt-MT-GraceNeural',
    'Maltese (Malta)-Joseph- (Male)': 'mt-MT-JosephNeural',
    'Burmese (Myanmar)-Nilar- (Female)': 'my-MM-NilarNeural',
    'Burmese (Myanmar)-Thiha- (Male)': 'my-MM-ThihaNeural',
    'Nepali (Nepal)-Hemkala- (Female)': 'ne-NP-HemkalaNeural',
    'Nepali (Nepal)-Sagar- (Male)': 'ne-NP-SagarNeural',
    'Dutch (Belgium)-Arnaud- (Male)': 'nl-BE-ArnaudNeural',
    'Dutch (Belgium)-Dena- (Female)': 'nl-BE-DenaNeural',
    'Polish (Poland)-Marek- (Male)': 'pl-PL-MarekNeural',
    'Polish (Poland)-Zofia- (Female)': 'pl-PL-ZofiaNeural',
    # The original value was truncated to 'ps-AF-Gul', which is not a voice id.
    'Pashto (Afghanistan)-Gul Nawaz- (Male)': 'ps-AF-GulNawazNeural',
}
def download_audio(url):
    """Download the audio at *url* with yt-dlp and load it as WAV samples.

    The download is written under ``ytdl/`` and converted to WAV by the
    ``FFmpegExtractAudio`` post-processor; the resulting file is then read
    back with ``scipy.io.wavfile.read``.

    Args:
        url: Address of the video/audio page handed to yt-dlp.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is the WAV data
        converted to a NumPy ``int16`` array.
    """
    wav_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }
    options = {
        'format': 'bestaudio/best',
        'outtmpl': 'ytdl/%(title)s.%(ext)s',
        'postprocessors': [wav_extractor],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        metadata = downloader.extract_info(url, download=True)
        # prepare_filename() reports the pre-conversion name, so swap the
        # extension for the ".wav" produced by the post-processor.
        downloaded_name = downloader.prepare_filename(metadata)
        wav_path = downloaded_name.rsplit('.', 1)[0] + '.wav'
        rate, samples = read(wav_path)
        return rate, np.asarray(samples, dtype=np.int16)
# Define a function to handle the entire separation process
def separate_audio(input_audio, output_dir, model_voc_inst, model_deecho, model_back_voc):
    """Run the three-stage separation chain and return the six output paths.

    Stages, each run with its own model through one ``Separator`` instance:
      1. ``model_voc_inst`` splits ``input_audio`` into instrumental and vocals.
      2. ``model_deecho`` splits the vocals into no-reverb and reverb parts.
      3. ``model_back_voc`` splits the no-reverb vocals into backing and lead.

    For every stage the first file returned by ``separator.separate`` is
    treated as the first stem listed above and the second file as the second
    stem (this ordering is what the code relies on; it is not checked).

    Args:
        input_audio: Path of the audio file to separate.
        output_dir: Directory that receives every output file; created if
            it does not exist.
        model_voc_inst: Model filename for the vocal/instrumental split.
        model_deecho: Model filename for the de-echo/de-reverb split.
        model_back_voc: Model filename for the backing/lead vocal split.

    Returns:
        Tuple of paths: ``(instrumental, vocals, vocals_reverb,
        vocals_no_reverb, lead_vocals, backing_vocals)``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    separator = Separator(output_dir=output_dir)

    # Define output files
    vocals = os.path.join(output_dir, 'Vocals.wav')
    instrumental = os.path.join(output_dir, 'Instrumental.wav')
    vocals_reverb = os.path.join(output_dir, 'Vocals (Reverb).wav')
    vocals_no_reverb = os.path.join(output_dir, 'Vocals (No Reverb).wav')
    lead_vocals = os.path.join(output_dir, 'Lead Vocals.wav')
    backing_vocals = os.path.join(output_dir, 'Backing Vocals.wav')

    def run_stage(model_filename, source_path, first_target, second_target):
        """Load one model, separate ``source_path`` and rename both outputs."""
        separator.load_model(model_filename=model_filename)
        produced = separator.separate(source_path)
        # os.replace overwrites results of an earlier run on every platform,
        # whereas os.rename fails on Windows when the target already exists.
        os.replace(os.path.join(output_dir, produced[0]), first_target)
        os.replace(os.path.join(output_dir, produced[1]), second_target)

    # Splitting a track into Vocal and Instrumental
    run_stage(model_voc_inst, input_audio, instrumental, vocals)
    # Applying DeEcho-DeReverb to Vocals
    run_stage(model_deecho, vocals, vocals_no_reverb, vocals_reverb)
    # Separating Back Vocals from Main Vocals
    run_stage(model_back_voc, vocals_no_reverb, backing_vocals, lead_vocals)

    return instrumental, vocals, vocals_reverb, vocals_no_reverb, lead_vocals, backing_vocals
def _uploaded_audio_path(upload_audio):
    """Return the on-disk path of a Gradio upload (path string or file wrapper)."""
    if isinstance(upload_audio, (str, os.PathLike)):
        return os.fspath(upload_audio)
    # File-like wrappers expose the temporary file's location as ``.name``.
    return upload_audio.name


# Main function to process audio (Inference)
def process_audio(MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                  FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                  KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio=None):
    """Validate the inputs and run voice-conversion inference via ``infer_audio``.

    Args:
        MODEL_NAME: Name of the model to use; required.
        SOUND_PATH: Path of the audio to convert. When empty, the uploaded
            file is used instead.
        F0_CHANGE ... OUTPUT_FORMAT: Inference settings forwarded unchanged,
            in this order, to ``infer_audio``.
        upload_audio: Optional upload from the UI. The ``gr.File`` component
            is configured with ``type='filepath'``, so this is normally a
            path string; objects exposing ``.name`` are accepted as well.

    Returns:
        Whatever ``infer_audio`` returns, or the string
        ``"Please provide a model name."`` when ``MODEL_NAME`` is empty.
    """
    # Check if a model name is provided (before doing any other work)
    if not MODEL_NAME:
        return "Please provide a model name."

    # If no sound path is given, use the uploaded file. The upload already
    # lives on disk, so its path is used directly: the previous code called
    # .name/.read() on what is a plain path string and wrote into an
    # "uploaded_audio" directory that was never created.
    if not SOUND_PATH and upload_audio is not None:
        SOUND_PATH = _uploaded_audio_path(upload_audio)

    # Run the inference
    os.system("chmod +x stftpitchshift")
    inferred_audio = infer_audio(
        MODEL_NAME,
        SOUND_PATH,
        F0_CHANGE,
        F0_METHOD,
        MIN_PITCH,
        MAX_PITCH,
        CREPE_HOP_LENGTH,
        INDEX_RATE,
        FILTER_RADIUS,
        RMS_MIX_RATE,
        PROTECT,
        SPLIT_INFER,
        MIN_SILENCE,
        SILENCE_THRESHOLD,
        SEEK_STEP,
        KEEP_SILENCE,
        FORMANT_SHIFT,
        QUEFRENCY,
        TIMBRE,
        F0_AUTOTUNE,
        OUTPUT_FORMAT
    )
    return inferred_audio
async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with edge-tts and return the path of the audio file.

    Args:
        text: Text to speak.
        language_code: A key of ``language_dict`` (the label chosen in the
            voice dropdown). Unknown or missing labels fall back to
            ``en-US-JennyNeural``.

    Returns:
        Path of a temporary file holding the synthesized audio. The file is
        created with ``delete=False`` and is not removed by this function.
    """
    # The previous fallback "default_voice" is not a real edge-tts voice, so
    # an unselected dropdown made synthesis fail; use a valid voice instead.
    voice = language_dict.get(language_code, "en-US-JennyNeural")
    communicate = edge_tts.Communicate(text, voice)
    # edge-tts produces MP3 data, so name the file accordingly rather than
    # ".wav". Only the name is reserved here; the handle is closed before
    # edge-tts writes to the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
# Gradio Blocks Interface with Tabs
# Two tabs: "Inference" (voice conversion, with an optional TTS front-end that
# fills the upload slot) and "Audio Separation" (stem splitting, optionally
# fed by a downloaded link).
with gr.Blocks(title="Hex RVC") as app:
    gr.Markdown("# Hex RVC")

    with gr.Tab("Inference"):
        with gr.Row():
            MODEL_NAME = gr.Textbox(label="Model Name", placeholder="Enter model name")
            SOUND_PATH = gr.Textbox(label="Audio Path (Optional)", placeholder="Leave blank to upload audio")
            upload_audio = gr.File(label="Upload Audio", type='filepath', file_types=["audio"])

        with gr.Row():
            F0_CHANGE = gr.Number(label="Pitch Change (semitones)", value=0)
            F0_METHOD = gr.Dropdown(
                choices=["crepe", "harvest", "mangio-crepe", "rmvpe", "rmvpe+", "fcpe",
                         "hybrid[mangio-crepe+rmvpe]", "hybrid[mangio-crepe+fcpe]",
                         "hybrid[rmvpe+fcpe]", "hybrid[mangio-crepe+rmvpe+fcpe]"],
                label="F0 Method",
                value="fcpe",
            )

        with gr.Row():
            MIN_PITCH = gr.Textbox(label="Min Pitch", value="50")
            MAX_PITCH = gr.Textbox(label="Max Pitch", value="1100")
            CREPE_HOP_LENGTH = gr.Number(label="Crepe Hop Length", value=120)
            INDEX_RATE = gr.Slider(label="Index Rate", minimum=0, maximum=1, value=0.75)
            FILTER_RADIUS = gr.Number(label="Filter Radius", value=3)
            RMS_MIX_RATE = gr.Slider(label="RMS Mix Rate", minimum=0, maximum=1, value=0.25)
            PROTECT = gr.Slider(label="Protect", minimum=0, maximum=1, value=0.33)

        with gr.Accordion("Hex TTS"):
            input_text = gr.Textbox(lines=5, label="Input Text")
            language = gr.Dropdown(choices=list(language_dict.keys()), label="Choose the Voice Model")
            tts_convert = gr.Button("Convert")
            # The synthesized file is placed in the upload slot so it can be
            # used as the inference input. The original call was missing the
            # comma after `fn=...` and used `output=` instead of `outputs=`.
            tts_convert.click(fn=text_to_speech_edge, inputs=[input_text, language], outputs=upload_audio)

        with gr.Accordion("Advanced Settings", open=False):
            SPLIT_INFER = gr.Checkbox(label="Enable Split Inference", value=False)
            MIN_SILENCE = gr.Number(label="Min Silence (ms)", value=500)
            SILENCE_THRESHOLD = gr.Number(label="Silence Threshold (dBFS)", value=-50)
            SEEK_STEP = gr.Slider(label="Seek Step (ms)", minimum=1, maximum=10, value=1)
            KEEP_SILENCE = gr.Number(label="Keep Silence (ms)", value=200)
            FORMANT_SHIFT = gr.Checkbox(label="Enable Formant Shift", value=False)
            QUEFRENCY = gr.Number(label="Quefrency", value=0)
            TIMBRE = gr.Number(label="Timbre", value=1)
            F0_AUTOTUNE = gr.Checkbox(label="Enable F0 Autotune", value=False)
            OUTPUT_FORMAT = gr.Dropdown(choices=["wav", "flac", "mp3"], label="Output Format", value="wav")

        run_button = gr.Button("Run Inference")
        output_audio = gr.Audio(label="Generated Audio", type='filepath')

        # Input order must match the positional parameters of process_audio.
        run_button.click(
            process_audio,
            inputs=[MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                    FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                    KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio],
            outputs=output_audio
        )

    with gr.Tab("Audio Separation"):
        with gr.Row():
            # NOTE(review): `source=` is the Gradio 3.x spelling; newer
            # releases use `sources=[...]` — confirm against the pinned version.
            input_audio = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
            output_dir = gr.Textbox(value="/content/output", label="Output Directory")

        with gr.Accordion("Separation by Link", open=False):
            with gr.Row():
                roformer_link = gr.Textbox(
                    label="Link",
                    placeholder="Paste the link here",
                    interactive=True
                )
            with gr.Row():
                gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
            with gr.Row():
                roformer_download_button = gr.Button(
                    "Download!",
                    variant="primary"
                )
            # The downloaded audio is loaded into the separation input.
            roformer_download_button.click(download_audio, [roformer_link], [input_audio])

        with gr.Row():
            model_voc_inst = gr.Textbox(value='model_bs_roformer_ep_317_sdr_12.9755.ckpt', label="Vocal & Instrumental Model")
            model_deecho = gr.Textbox(value='UVR-DeEcho-DeReverb.pth', label="DeEcho-DeReverb Model")
            model_back_voc = gr.Textbox(value='mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt', label="Backing Vocals Model")

        separate_button = gr.Button("Separate Audio")

        with gr.Row():
            instrumental_out = gr.Audio(label="Instrumental")
            vocals_out = gr.Audio(label="Vocals")
            vocals_reverb_out = gr.Audio(label="Vocals (Reverb)")
            vocals_no_reverb_out = gr.Audio(label="Vocals (No Reverb)")
            lead_vocals_out = gr.Audio(label="Lead Vocals")
            backing_vocals_out = gr.Audio(label="Backing Vocals")

        # Output order must match the tuple returned by separate_audio.
        separate_button.click(
            separate_audio,
            inputs=[input_audio, output_dir, model_voc_inst, model_deecho, model_back_voc],
            outputs=[instrumental_out, vocals_out, vocals_reverb_out, vocals_no_reverb_out, lead_vocals_out, backing_vocals_out]
        )
# Launch the Gradio app: start serving the `app` Blocks interface using
# Gradio's default launch settings (no arguments are passed).
app.launch()