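"""Gradio app for speech analysis.

Transcribes recorded audio, runs sentiment and emotion classification on the
transcript, and replies with synthesized speech (gTTS).
"""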
import os
import logging
from typing import Optional, Tuple

import gradio as gr
import torch
from transformers import pipeline
from gtts import gTTS
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Before initializing any Hugging Face components, unset proxy environment variables
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
os.environ.pop("http_proxy", None)
os.environ.pop("https_proxy", None)
def create_temp_dir():
    """Create a temporary directory for audio files if it doesn't exist."""
    temp_dir = "temp_audio"
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
class EnhancedAIAgent:
    def __init__(self):
        """Initialize the AI agent with models and pipelines."""
        try:
            logger.info("Initializing AI Agent...")

            # Set device
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")

            # Initialize speech recognition
            logger.info("Loading speech recognition model...")
            self.speech_recognizer = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-base-960h",
                device=self.device
            )

            # Initialize sentiment analysis
            logger.info("Loading sentiment analysis model...")
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=self.device
            )

            # Initialize emotion recognition
            logger.info("Loading emotion recognition model...")
            self.emotion_recognizer = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                device=self.device
            )

            # Create temporary directory for audio files
            self.temp_dir = create_temp_dir()

            logger.info("AI Agent initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing AI Agent: {str(e)}")
            raise
    def process_audio(
        self,
        audio_path: Optional[str],
        history: Optional[str]
    ) -> Tuple[str, Optional[str], str, str]:
        """Process audio input and generate a response.

        Returns (updated history, response audio path, sentiment, emotion).
        """
        try:
            # Initialize history
            history_list = [] if not history else history.split('\n')

            # Handle no audio input
            if not audio_path:
                return history or "", None, "", ""

            # Convert speech to text
            logger.info("Converting speech to text...")
            user_input = self.speech_recognizer(audio_path)["text"]
            logger.info(f"Transcribed text: {user_input}")

            # Generate a simple echo response (can be enhanced later)
            response = f"I heard you say: {user_input}"

            # Analyze sentiment
            logger.info("Analyzing sentiment...")
            sentiment_result = self.sentiment_analyzer(user_input)[0]
            sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"

            # Analyze emotion
            logger.info("Analyzing emotion...")
            emotion_result = self.emotion_recognizer(user_input)[0]
            emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"

            # Generate audio response
            logger.info("Generating audio response...")
            audio_output = self.text_to_speech(response)

            # Update history
            history_list.extend([
                f"User: {user_input}",
                f"Assistant: {response}",
                sentiment,
                emotion
            ])

            return '\n'.join(history_list), audio_output, sentiment, emotion
        except Exception as e:
            logger.error(f"Error processing audio: {str(e)}")
            return str(history or ""), None, f"Error: {str(e)}", f"Error: {str(e)}"
    def text_to_speech(self, text: str) -> Optional[str]:
        """Convert text to speech; returns the path to an MP3 file, or None on failure."""
        try:
            # gTTS writes MP3 data, so the output file gets an .mp3 extension
            output_path = os.path.join(self.temp_dir, f"response_{hash(text)}.mp3")
            tts = gTTS(text=text, lang='en')
            tts.save(output_path)
            return output_path
        except Exception as e:
            logger.error(f"Error in text-to-speech conversion: {str(e)}")
            return None
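
    # Minimal usage sketch (assumes network access, since gTTS calls an online
    # TTS service, and that the models above have been downloaded; "clip.wav"
    # is a hypothetical input file):
    #   agent = EnhancedAIAgent()
    #   history, audio_path, sentiment, emotion = agent.process_audio("clip.wav", "")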
def create_interface():
    """Create the Gradio interface."""
    try:
        # Initialize AI Agent
        agent = EnhancedAIAgent()

        # Define interface
        with gr.Blocks() as interface:
            gr.Markdown("# AI Speech Analysis App")

            with gr.Row():
                with gr.Column(scale=2):
                    # Audio input
                    audio_input = gr.Audio(
                        label="Record your message",
                        type="filepath"
                    )

                    # Analyze button
                    analyze_button = gr.Button(
                        "Analyze Speech",
                        variant="primary"
                    )

                    # Chat history
                    chat_history = gr.Textbox(
                        label="Conversation History",
                        lines=10,
                        interactive=False
                    )

                    # Audio output
                    audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath"
                    )

                    # Analysis displays
                    sentiment_display = gr.Textbox(
                        label="Sentiment Analysis",
                        interactive=False
                    )
                    emotion_display = gr.Textbox(
                        label="Emotion Recognition",
                        interactive=False
                    )

            # Set up event handler
            analyze_button.click(
                fn=agent.process_audio,
                inputs=[audio_input, chat_history],
                outputs=[chat_history, audio_output, sentiment_display, emotion_display]
            )

            # Instructions
            gr.Markdown("""
            ### How to Use:
            1. Click the microphone icon to start recording
            2. Speak your message
            3. Click stop when finished
            4. Press "Analyze Speech" to process your message
            5. View the results and listen to the response
            """)

        return interface
    except Exception as e:
        logger.error(f"Error creating interface: {str(e)}")
        raise
# Create the interface
demo = create_interface()

# Launch the app
if __name__ == "__main__":
    demo.launch()
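    # Assumption: default launch settings suffice here; pass share=True to
    # demo.launch() for a temporary public link when running locally.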