import os
import logging
from typing import Optional, Tuple

import gradio as gr
import torch
from transformers import pipeline
from gtts import gTTS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Before initializing any Hugging Face components, unset proxy environment variables
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
os.environ.pop("http_proxy", None)
os.environ.pop("https_proxy", None)


def create_temp_dir():
    """Create a temporary directory for audio files if it doesn't exist."""
    temp_dir = "temp_audio"
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir


class EnhancedAIAgent:
    def __init__(self):
        """Initialize the AI agent with models and pipelines."""
        try:
            logger.info("Initializing AI Agent...")

            # Set device
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")

            # Initialize speech recognition
            logger.info("Loading speech recognition model...")
            self.speech_recognizer = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-base-960h",
                device=self.device
            )

            # Initialize sentiment analysis
            logger.info("Loading sentiment analysis model...")
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=self.device
            )

            # Initialize emotion recognition
            logger.info("Loading emotion recognition model...")
            self.emotion_recognizer = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                device=self.device
            )

            # Create temporary directory for audio files
            self.temp_dir = create_temp_dir()

            logger.info("AI Agent initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing AI Agent: {str(e)}")
            raise

    def process_audio(
        self,
        audio_path: Optional[str],
        history: Optional[str]
    ) -> Tuple[str, Optional[str], str, str]:
        """Process audio input and generate a response."""
        try:
            # Initialize history
            history_list = [] if not history else history.split('\n')

            # Handle no audio input
            if not audio_path:
                return history or "", None, "", ""

            # Convert speech to text
            logger.info("Converting speech to text...")
            user_input = self.speech_recognizer(audio_path)["text"]
            logger.info(f"Transcribed text: {user_input}")

            # Generate a simple response (can be enhanced later)
            response = f"I heard you say: {user_input}"

            # Analyze sentiment
            logger.info("Analyzing sentiment...")
            sentiment_result = self.sentiment_analyzer(user_input)[0]
            sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"

            # Analyze emotion
            logger.info("Analyzing emotion...")
            emotion_result = self.emotion_recognizer(user_input)[0]
            emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"

            # Generate audio response
            logger.info("Generating audio response...")
            audio_output = self.text_to_speech(response)

            # Update history
            history_list.extend([
                f"User: {user_input}",
                f"Assistant: {response}",
                sentiment,
                emotion
            ])

            return '\n'.join(history_list), audio_output, sentiment, emotion

        except Exception as e:
            logger.error(f"Error processing audio: {str(e)}")
            return str(history or ""), None, f"Error: {str(e)}", f"Error: {str(e)}"

    def text_to_speech(self, text: str) -> Optional[str]:
        """Convert text to speech."""
        try:
            # gTTS produces MP3 audio, so the file needs an .mp3 extension
            output_path = os.path.join(self.temp_dir, f"response_{hash(text)}.mp3")
            tts = gTTS(text=text, lang='en')
            tts.save(output_path)
            return output_path
        except Exception as e:
            logger.error(f"Error in text-to-speech conversion: {str(e)}")
            return None
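
# A minimal smoke test for the agent, assuming a local WAV recording and
# network access for gTTS (which calls Google's TTS endpoint). The file name
# "sample.wav" is hypothetical; substitute a real recording:
#
#     agent = EnhancedAIAgent()
#     history, audio, sentiment, emotion = agent.process_audio("sample.wav", None)
#     print(history)
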
def create_interface():
    """Create the Gradio interface."""
    try:
        # Initialize AI Agent
        agent = EnhancedAIAgent()

        # Define interface
        with gr.Blocks() as interface:
            gr.Markdown("# AI Speech Analysis App")

            with gr.Row():
                with gr.Column(scale=2):
                    # Audio input
                    audio_input = gr.Audio(
                        label="Record your message",
                        type="filepath"
                    )

                    # Analyze button
                    analyze_button = gr.Button(
                        "Analyze Speech",
                        variant="primary"
                    )

                    # Chat history
                    chat_history = gr.Textbox(
                        label="Conversation History",
                        lines=10,
                        interactive=False
                    )

                    # Audio output
                    audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath"
                    )

                    # Analysis displays
                    sentiment_display = gr.Textbox(
                        label="Sentiment Analysis",
                        interactive=False
                    )
                    emotion_display = gr.Textbox(
                        label="Emotion Recognition",
                        interactive=False
                    )

            # Set up event handler
            analyze_button.click(
                fn=agent.process_audio,
                inputs=[audio_input, chat_history],
                outputs=[chat_history, audio_output, sentiment_display, emotion_display]
            )

            # Instructions
            gr.Markdown("""
            ### How to Use:
            1. Click the microphone icon to start recording
            2. Speak your message
            3. Click stop when finished
            4. Press "Analyze Speech" to process your message
            5. View the results and listen to the response
            """)

        return interface

    except Exception as e:
        logger.error(f"Error creating interface: {str(e)}")
        raise


# Create and launch the interface
demo = create_interface()

# Launch the app
if __name__ == "__main__":
    demo.launch()
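
# Optional: clean up generated audio when the process exits. A minimal sketch
# using only the standard library (atexit and shutil); "temp_audio" matches the
# directory created by create_temp_dir above:
#
#     import atexit, shutil
#     atexit.register(shutil.rmtree, "temp_audio", ignore_errors=True)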