import os
import logging
from typing import Optional, Tuple

import gradio as gr
import torch
from gtts import gTTS
from transformers import pipeline

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
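
# Clear proxy settings from the environment; stale proxies can break
# model downloads from the Hugging Face Hub.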
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
os.environ.pop("http_proxy", None)
os.environ.pop("https_proxy", None)


def create_temp_dir():
    """Create temporary directory for audio files if it doesn't exist."""
    temp_dir = "temp_audio"
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
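
# An alternative sketch using the standard library, if you would rather keep
# generated audio out of the working directory (tempfile.mkdtemp is the real
# stdlib call; the prefix is illustrative):
#
#   import tempfile
#   temp_dir = tempfile.mkdtemp(prefix="temp_audio_")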


class EnhancedAIAgent:
    def __init__(self):
        """Initialize the AI agent with models and pipelines."""
        try:
            logger.info("Initializing AI Agent...")

            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")
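
            # Note: recent transformers releases accept a device string
            # ("cuda"/"cpu") for pipeline(); older releases expect an integer
            # index instead, e.g. device=0 if torch.cuda.is_available() else -1.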
            logger.info("Loading speech recognition model...")
            self.speech_recognizer = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-base-960h",
                device=self.device
            )
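
            # wav2vec2-base-960h emits uppercase transcripts without
            # punctuation; keep that in mind when reading the text fields
            # downstream.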

            logger.info("Loading sentiment analysis model...")
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=self.device
            )

            logger.info("Loading emotion recognition model...")
            self.emotion_recognizer = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                device=self.device
            )
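
            # Per its model card, this emotion model predicts one of seven
            # labels: anger, disgust, fear, joy, neutral, sadness, surprise.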

            self.temp_dir = create_temp_dir()

            logger.info("AI Agent initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing AI Agent: {str(e)}")
            raise

    def process_audio(
        self,
        audio_path: Optional[str],
        history: Optional[str]
    ) -> Tuple[str, Optional[str], str, str]:
        """Process audio input and generate response."""
        try:
            history_list = [] if not history else history.split('\n')

            if not audio_path:
                return history or "", None, "", ""

            logger.info("Converting speech to text...")
            user_input = self.speech_recognizer(audio_path)["text"]
            logger.info(f"Transcribed text: {user_input}")

            response = f"I heard you say: {user_input}"

            logger.info("Analyzing sentiment...")
            sentiment_result = self.sentiment_analyzer(user_input)[0]
            sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"

            logger.info("Analyzing emotion...")
            emotion_result = self.emotion_recognizer(user_input)[0]
            emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"

            logger.info("Generating audio response...")
            audio_output = self.text_to_speech(response)

            history_list.extend([
                f"User: {user_input}",
                f"Assistant: {response}",
                sentiment,
                emotion
            ])

            return '\n'.join(history_list), audio_output, sentiment, emotion

        except Exception as e:
            logger.error(f"Error processing audio: {str(e)}")
            return history or "", None, f"Error: {str(e)}", f"Error: {str(e)}"

    def text_to_speech(self, text: str) -> Optional[str]:
        """Convert text to speech."""
        try:
            # gTTS synthesizes via Google's TTS web service and always writes
            # MP3, so the output file needs an .mp3 extension (not .wav).
            output_path = os.path.join(self.temp_dir, f"response_{hash(text)}.mp3")
            tts = gTTS(text=text, lang='en')
            tts.save(output_path)
            return output_path
        except Exception as e:
            logger.error(f"Error in text-to-speech conversion: {str(e)}")
            return None
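

# A minimal smoke test for the agent, assuming a local recording at
# "sample.wav" (hypothetical path) and network access for gTTS:
#
#   agent = EnhancedAIAgent()
#   history, audio_path, sentiment, emotion = agent.process_audio("sample.wav", None)
#   print(sentiment)
#   print(emotion)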


def create_interface():
    """Create the Gradio interface."""
    try:
        agent = EnhancedAIAgent()

        with gr.Blocks() as interface:
            gr.Markdown("# AI Speech Analysis App")

            with gr.Row():
                with gr.Column(scale=2):
                    audio_input = gr.Audio(
                        label="Record your message",
                        type="filepath"
                    )

                    analyze_button = gr.Button(
                        "Analyze Speech",
                        variant="primary"
                    )

                    chat_history = gr.Textbox(
                        label="Conversation History",
                        lines=10,
                        interactive=False
                    )

                    audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath"
                    )

                    sentiment_display = gr.Textbox(
                        label="Sentiment Analysis",
                        interactive=False
                    )

                    emotion_display = gr.Textbox(
                        label="Emotion Recognition",
                        interactive=False
                    )

            analyze_button.click(
                fn=agent.process_audio,
                inputs=[audio_input, chat_history],
                outputs=[chat_history, audio_output, sentiment_display, emotion_display]
            )

            gr.Markdown("""
            ### How to Use:
            1. Click the microphone icon to start recording
            2. Speak your message
            3. Click stop when finished
            4. Press "Analyze Speech" to process your message
            5. View the results and listen to the response
            """)

        return interface

    except Exception as e:
        logger.error(f"Error creating interface: {str(e)}")
        raise


demo = create_interface()
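
# For container or remote deployment, Gradio's standard launch options apply,
# e.g. demo.launch(server_name="0.0.0.0", server_port=7860).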
if __name__ == "__main__":
    demo.launch()