|
import spaces |
|
import gradio as gr |
|
import torch |
|
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer |
|
from string import punctuation |
|
import re |
|
|
|
|
|
from parler_tts import ParlerTTSForConditionalGeneration |
|
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed |
|
|
|
# Pick the compute device: the first CUDA GPU when one is available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Hub repository from which the model, tokenizer and feature extractor are loaded.
repo_id = "PHBJT/french_parler_tts_mini_v0.1"

# Everything is loaded once at import time and shared by all requests.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate reported by the feature extractor; gen_tts returns it with the audio.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed passed to set_seed() before each generation.
SEED = 42
|
|
|
# Default prompt text and voice description.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."

# (text, voice description) pairs; the first pair is the defaults above.
_example_pairs = (
    (default_text, default_description),
    (
        "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
        "A male voice delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice, creating a close-sounding audio experience.",
    ),
    (
        "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
        "A male voice provides a monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
    ),
    (
        "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
        "A female voice, in a very poor recording quality, delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
    ),
)

# Each example row is [text, description, None]; the third slot is always None.
examples = [[sample_text, sample_description, None] for sample_text, sample_description in _example_pairs]
|
# Number normalizer from transformers' SpeechT5 utilities, applied to every prompt.
# NOTE(review): this normalizer is English-specific while the model repository
# and the sample texts are French — confirm that this is intended.
number_normalizer = EnglishNumberNormalizer()

# An upper-case letter followed by one or more upper-case letters or dots,
# delimited by word boundaries (e.g. "SNCF", "U.S.A").
_ABBREVIATION_RE = re.compile(r'\b[A-Z][A-Z\.]+\b')


def _spell_out_abbreviation(match):
    """Return the matched abbreviation without dots, letters separated by spaces."""
    return " ".join(match.group(0).replace(".", ""))


def preprocess(text):
    """Normalize raw text before it is tokenized as the TTS prompt.

    Steps, in order:
      1. run the text through ``number_normalizer`` and strip surrounding
         whitespace;
      2. replace every hyphen with a space;
      3. append a final "." when the text does not already end with a
         punctuation character;
      4. spell out upper-case abbreviations letter by letter
         ("SNCF" -> "S N C F", "U.S.A" -> "U S A").

    Args:
        text: The raw text to read aloud.

    Returns:
        The normalized text. An input that is empty after stripping is
        returned as an empty string instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")

    # Nothing to punctuate or expand; also avoids indexing an empty string.
    if not text:
        return text

    if text[-1] not in punctuation:
        text = f"{text}."

    # A single regex pass rewrites each match exactly where it occurs, so an
    # abbreviation that is a prefix of another one ("AB" vs "ABC") can no
    # longer corrupt the longer one, unlike chained str.replace calls.
    return _ABBREVIATION_RE.sub(_spell_out_abbreviation, text)
|
|
|
@spaces.GPU
def gen_tts(text, description):
    """Synthesize speech for ``text`` in the voice described by ``description``.

    Args:
        text: Text to read aloud; it is passed through ``preprocess`` before
            being tokenized as the prompt.
        description: Free-text description of the desired voice; it is
            stripped and tokenized as the conditioning input. A missing
            description is treated as an empty string.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the generated
        output moved to the CPU, converted to a NumPy array and squeezed.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject empty input up front with a readable message rather than letting
    # preprocessing or generation fail on it.
    if not text or not text.strip():
        raise gr.Error("Aucun texte à réciter : extrayez ou saisissez d'abord un texte.")

    description_inputs = tokenizer((description or "").strip(), return_tensors="pt").to(device)
    prompt_inputs = tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Re-seed before every call so sampling always starts from the same RNG state.
    set_seed(SEED)
    generation = model.generate(
        input_ids=description_inputs.input_ids,
        attention_mask=description_inputs.attention_mask,
        prompt_input_ids=prompt_inputs.input_ids,
        prompt_attention_mask=prompt_inputs.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    return SAMPLE_RATE, audio_arr
|
|
|
|
|
def extract_text(file, max_pages=10):
    """Extract the text of the first pages of a PDF document.

    Args:
        file: The PDF to read; it is passed unchanged to ``pypdf.PdfReader``.
        max_pages: Maximum number of leading pages to extract. Defaults to 10.

    Returns:
        The text of the selected pages, joined with newlines.

    Raises:
        gr.Error: If no file was provided.
    """
    if file is None:
        raise gr.Error("Veuillez d'abord téléverser un document PDF.")

    # Local import: pypdf is only loaded when a PDF is actually extracted.
    from pypdf import PdfReader

    reader = PdfReader(file)
    # A newline between pages keeps the last word of one page from fusing with
    # the first word of the next; `or ""` tolerates a page yielding no text.
    return "\n".join(
        (page.extract_text() or "") for page in reader.pages[:max_pages]
    )
|
|
|
# Gradio UI: upload a PDF, extract its text, then synthesize the text as audio.
with gr.Blocks() as demo:
    # The intro credits Parler-TTS, the model family this app actually loads.
    gr.Markdown("""# PDF reader

Un lecteur pdf construit avec [Parler-TTS](https://github.com/huggingface/parler-tts).

### Comment l'utiliser ?

1. Téléversez le document pdf à lire.
2. Cliquez sur "Extraire le texte" pour extraire les 10 premières pages.
3. Cliquez sur "Réciter le texte" pour générer l'audio.""")

    with gr.Group():
        speaker_description = gr.Textbox(
            value='A male voice delivers a slightly expressive and animated speech with a quick speed. The recording features a low-pitch voice, creating a close-sounding audio experience.',
            label='Description de la voix',
        )
        file = gr.File(label="Document à lire")
        btn_extract = gr.Button('Extraire le texte', variant='primary')
        text = gr.Textbox(label="Texte extrait")
        btn = gr.Button('Réciter le texte', variant='primary')
        audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    # Step 1: uploaded PDF -> extracted text shown in the textbox.
    btn_extract.click(extract_text, inputs=[file], outputs=[text])
    # Step 2: textbox content + voice description -> generated audio.
    btn.click(gen_tts, inputs=[text, speaker_description], outputs=[audio_out])

    gr.Markdown('Demo by [m-ric](https://x.com/AymericRoucher).')


demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True, share=True)