Anupam251272 committed
Commit a5f5dfe
1 Parent(s): fa489aa

Create app.py

Files changed (1)
  app.py +214 -0

app.py ADDED
@@ -0,0 +1,214 @@
+ import os
+ import logging
+ from typing import Optional, Tuple
+
+ import gradio as gr
+ import torch
+ from gtts import gTTS
+ from transformers import pipeline
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Before initializing any Hugging Face components, unset proxy environment variables:
+ os.environ.pop("HTTP_PROXY", None)
+ os.environ.pop("HTTPS_PROXY", None)
+ os.environ.pop("http_proxy", None)
+ os.environ.pop("https_proxy", None)
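+ # (Assumption: this guards against stale proxy settings that can break
+ # huggingface_hub downloads; popping is a no-op when the variables are unset.)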
+
+
+ def create_temp_dir():
+     """Create temporary directory for audio files if it doesn't exist"""
+     temp_dir = "temp_audio"
+     os.makedirs(temp_dir, exist_ok=True)
+     return temp_dir
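+ # Design note: a fixed "temp_audio" folder persists across runs; Python's
+ # tempfile.mkdtemp() would give per-run isolation if cleanup matters.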
+
+
+ class EnhancedAIAgent:
+     def __init__(self):
+         """Initialize the AI agent with models and pipelines"""
+         try:
+             logger.info("Initializing AI Agent...")
+
+             # Set device
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+             logger.info(f"Using device: {self.device}")
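+
+             # Note: recent transformers releases accept a device string
+             # ("cuda"/"cpu") for pipeline(); older ones expect an int index
+             # (0 for the first GPU, -1 for CPU).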
+
+             # Initialize speech recognition
+             logger.info("Loading speech recognition model...")
+             self.speech_recognizer = pipeline(
+                 "automatic-speech-recognition",
+                 model="facebook/wav2vec2-base-960h",
+                 device=self.device
+             )
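+             # Heads-up: this CTC model emits uppercase text without punctuation;
+             # the uncased sentiment model below tolerates that, but transcripts
+             # will read as all-caps.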
+
+             # Initialize sentiment analysis
+             logger.info("Loading sentiment analysis model...")
+             self.sentiment_analyzer = pipeline(
+                 "sentiment-analysis",
+                 model="distilbert-base-uncased-finetuned-sst-2-english",
+                 device=self.device
+             )
+
+             # Initialize emotion recognition
+             logger.info("Loading emotion recognition model...")
+             self.emotion_recognizer = pipeline(
+                 "text-classification",
+                 model="j-hartmann/emotion-english-distilroberta-base",
+                 device=self.device
+             )
+
+             # Create temporary directory for audio files
+             self.temp_dir = create_temp_dir()
+
+             logger.info("AI Agent initialized successfully")
+
+         except Exception as e:
+             logger.error(f"Error initializing AI Agent: {str(e)}")
+             raise
+
+     def process_audio(
+         self,
+         audio_path: Optional[str],
+         history: Optional[str]
+     ) -> Tuple[str, Optional[str], str, str]:
+         """Process audio input and generate response"""
+         try:
+             # Initialize history
+             history_list = [] if not history else history.split('\n')
+
+             # Handle no audio input
+             if not audio_path:
+                 return history or "", None, "", ""
+
+             # Convert speech to text
+             logger.info("Converting speech to text...")
+             user_input = self.speech_recognizer(audio_path)["text"]
+             logger.info(f"Transcribed text: {user_input}")
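+
+             # Assumption: recordings are short clips; for long audio, building
+             # the ASR pipeline with chunk_length_s (e.g. chunk_length_s=30)
+             # transcribes in windows and keeps memory bounded.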
+
+             # Generate simple response (can be enhanced later)
+             response = f"I heard you say: {user_input}"
+
+             # Analyze sentiment
+             logger.info("Analyzing sentiment...")
+             sentiment_result = self.sentiment_analyzer(user_input)[0]
+             sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"
+
+             # Analyze emotion
+             logger.info("Analyzing emotion...")
+             emotion_result = self.emotion_recognizer(user_input)[0]
+             emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"
+
+             # Generate audio response
+             logger.info("Generating audio response...")
+             audio_output = self.text_to_speech(response)
+
+             # Update history
+             history_list.extend([
+                 f"User: {user_input}",
+                 f"Assistant: {response}",
+                 sentiment,
+                 emotion
+             ])
+
+             return '\n'.join(history_list), audio_output, sentiment, emotion
+
+         except Exception as e:
+             logger.error(f"Error processing audio: {str(e)}")
+             return str(history or ""), None, f"Error: {str(e)}", f"Error: {str(e)}"
+
+     def text_to_speech(self, text: str) -> Optional[str]:
+         """Convert text to speech"""
+         try:
+             # gTTS produces MP3 data, so use a matching .mp3 extension.
+             # str hashes are salted per process, so these filenames are only
+             # unique within a single run.
+             output_path = os.path.join(self.temp_dir, f"response_{hash(text)}.mp3")
+             tts = gTTS(text=text, lang='en')
+             tts.save(output_path)
+             return output_path
+         except Exception as e:
+             logger.error(f"Error in text-to-speech conversion: {str(e)}")
+             return None
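+     # Note: gTTS calls Google's online TTS endpoint, so this method needs
+     # network access; offline it logs the error above and returns None.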
+
+
+ def create_interface():
+     """Create the Gradio interface"""
+     try:
+         # Initialize AI Agent
+         agent = EnhancedAIAgent()
+
+         # Define interface
+         with gr.Blocks() as interface:
+             gr.Markdown("# AI Speech Analysis App")
+
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     # Audio input
+                     audio_input = gr.Audio(
+                         label="Record your message",
+                         type="filepath"
+                     )
+
+                     # Analyze button
+                     analyze_button = gr.Button(
+                         "Analyze Speech",
+                         variant="primary"
+                     )
+
+                     # Chat history
+                     chat_history = gr.Textbox(
+                         label="Conversation History",
+                         lines=10,
+                         interactive=False
+                     )
+
+                     # Audio output
+                     audio_output = gr.Audio(
+                         label="AI Response",
+                         type="filepath"
+                     )
+
+                     # Analysis displays
+                     sentiment_display = gr.Textbox(
+                         label="Sentiment Analysis",
+                         interactive=False
+                     )
+
+                     emotion_display = gr.Textbox(
+                         label="Emotion Recognition",
+                         interactive=False
+                     )
+
+             # Set up event handler
+             analyze_button.click(
+                 fn=agent.process_audio,
+                 inputs=[audio_input, chat_history],
+                 outputs=[chat_history, audio_output, sentiment_display, emotion_display]
+             )
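+
+             # Design note: conversation state round-trips through the history
+             # Textbox; gr.State would be an alternative if the transcript box
+             # ever becomes display-only.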
+
+             # Instructions
+             gr.Markdown("""
+             ### How to Use:
+             1. Click the microphone icon to start recording
+             2. Speak your message
+             3. Click stop when finished
+             4. Press "Analyze Speech" to process your message
+             5. View the results and listen to the response
+             """)
+
+         return interface
+
+     except Exception as e:
+         logger.error(f"Error creating interface: {str(e)}")
+         raise
+
+
+ # Build the interface at import time (Spaces imports app.py and serves `demo`)
+ demo = create_interface()
+
+ # Launch the app when run directly
+ if __name__ == "__main__":
+     demo.launch()
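+     # (When running locally, demo.launch(share=True) is one way to get a
+     # temporary public URL; on Spaces the platform handles serving.)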