File size: 2,588 Bytes
0a48c54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33c4cc8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
from pathlib import Path
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Validate the key BEFORE building the client. When no key is available the
# OpenAI constructor raises its own generic error, which would pre-empt the
# more helpful message below if the check came afterwards.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise ValueError("Please set the OPENAI_API_KEY in your .env file")

# Initialize the OpenAI client
client = OpenAI(api_key=_openai_api_key)

def _parse_versions(response_text, original_text):
    """Pull the labelled rewrites out of the model reply.

    Only lines that start with ``Emotional:`` or ``Exaggerated:`` (case
    insensitive, optionally preceded by a list marker such as ``1.``) are
    used; the first occurrence of each label wins. The original version is
    taken from ``original_text`` rather than from the model's echo, so it can
    never be altered or truncated by the model.

    Returns:
        ``[original, emotional, exaggerated]`` -- always exactly three strings.

    Raises:
        ValueError: if either rewrite is missing or empty in the reply.
    """
    wanted = ("Emotional", "Exaggerated")
    found = {}
    for raw_line in response_text.splitlines():
        # Tolerate leading list markers ("2. Emotional: ...", "- Emotional: ...").
        line = raw_line.strip().lstrip("0123456789.-* ")
        for label in wanted:
            prefix = f"{label}:"
            if label not in found and line.lower().startswith(prefix.lower()):
                found[label] = line[len(prefix):].strip()

    missing = [label for label in wanted if not found.get(label)]
    if missing:
        raise ValueError(
            "Model response is missing the following version(s): "
            + ", ".join(missing)
        )
    return [original_text, found["Emotional"], found["Exaggerated"]]


def generate_versions(text):
    """Ask the chat model for two more emotional rewrites of ``text``.

    Args:
        text: The original sentence entered by the user.

    Returns:
        A list of exactly three strings: the original text, the slightly more
        emotional version, and the exaggerated version, in that order.

    Raises:
        ValueError: if the model reply does not contain both rewrites.
    """
    # The Korean samples are few-shot examples of the desired intensity
    # ("It's dangerous" -> "Wait! No, it's dangerous!!").
    prompt = f"""Given the original text: "{text}"
    Generate two rephrased versions:
    1. A slightly more emotional version (ex. "위험해요" -> "위험해요!!")
    2. An exaggerated, highly emotional version (ex. "위험해요" -> "잠깐만요! 안돼, 위험해요!!")
    Output format:
    Original: [original text]
    Emotional: [emotional version]
    Exaggerated: [exaggerated version]"""

    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may carry no choices at all; skip it instead of
        # failing on choices[0].
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta:
            pieces.append(delta)

    return _parse_versions("".join(pieces), text)

def text_to_speech(text, *, model="tts-1", voice="alloy"):
    """Synthesize speech for ``text`` with the OpenAI speech endpoint.

    Args:
        text: The text to be spoken.
        model: Name of the speech model to use. Defaults to ``"tts-1"``.
        voice: Name of the voice preset to use. Defaults to ``"alloy"``.

    Returns:
        The ``content`` attribute of the API response, i.e. the raw audio
        payload produced by the service.
    """
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
    )
    return response.content

def process_and_generate(text):
    """Gradio click handler: rewrite ``text`` and synthesize each version.

    Args:
        text: The contents of the input textbox.

    Returns:
        A flat list of seven values matching the handler's output wiring:
        three version strings, three audio payloads, and a status message.
        When the input is blank, or the rewrite step does not yield exactly
        three versions, the text slots are empty strings, the audio slots are
        ``None`` and the status message explains the problem.
    """
    expected_versions = 3
    blank_result = [""] * expected_versions + [None] * expected_versions

    # Do not spend API calls on empty input.
    if not text or not text.strip():
        return blank_result + ["Please enter some text first."]

    versions = list(generate_versions(text))
    # Check the count before any speech is synthesized: a mismatch would
    # otherwise hand the UI the wrong number of output values.
    if len(versions) != expected_versions:
        return blank_result + [
            f"Expected {expected_versions} versions but received "
            f"{len(versions)}; please try again."
        ]

    audio_contents = [text_to_speech(version) for version in versions]
    return versions + audio_contents + ["All versions generated successfully!"]

# Page layout: a heading, the input box and trigger button, then one row of
# rewritten texts, one row of matching audio players, and a status line.
with gr.Blocks(title="Emotional TTS Comparison") as demo:
    gr.Markdown("# Emotional TTS Comparison")
    gr.Markdown("Enter text to generate three versions with varying emotional intensity.")

    input_text = gr.Textbox(label="Original Text", lines=3)
    generate_btn = gr.Button("Generate Versions and Speech")

    # Text of each version, side by side.
    with gr.Row():
        text1 = gr.Textbox(label="Original Version")
        text2 = gr.Textbox(label="Emotional Version")
        text3 = gr.Textbox(label="Exaggerated Version")

    # Audio for each version, in the same left-to-right order as the texts.
    with gr.Row():
        audio1 = gr.Audio(label="Original Speech")
        audio2 = gr.Audio(label="Emotional Speech")
        audio3 = gr.Audio(label="Exaggerated Speech")

    status = gr.Textbox(label="Status")

    # The handler returns texts first, then audio, then the status message,
    # so the output components are listed in exactly that order.
    version_boxes = [text1, text2, text3]
    speech_players = [audio1, audio2, audio3]
    generate_btn.click(
        fn=process_and_generate,
        inputs=[input_text],
        outputs=version_boxes + speech_players + [status],
    )

# Entry point: when executed as a script, start Gradio's own server; when the
# module is imported instead, expose a module-level `app` object.
if __name__ == "__main__":
    demo.launch()
else:
    # NOTE(review): gradio documents this helper as
    # mount_gradio_app(app, blocks, path), where `app` is an existing FastAPI
    # application. Here `demo` lands in the `app` slot, "/" in the `blocks`
    # slot, and no `path` is given, so this looks like it would raise a
    # TypeError as soon as the module is imported. Probably intended:
    # mount_gradio_app(FastAPI(), demo, path="/") -- TODO confirm and fix.
    app = gr.mount_gradio_app(demo, "/")