Anupam251272 committed
Commit a5f5dfe
1 Parent(s): fa489aa

Create app.py

Files changed (1)
  app.py +214 -0

app.py ADDED
@@ -0,0 +1,214 @@
+ import os
+ import logging
+ from typing import Optional, Tuple
+
+ import gradio as gr
+ import torch
+ from gtts import gTTS
+ from transformers import pipeline
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Before initializing any Hugging Face components, unset proxy environment variables:
+ os.environ.pop("HTTP_PROXY", None)
+ os.environ.pop("HTTPS_PROXY", None)
+ os.environ.pop("http_proxy", None)
+ os.environ.pop("https_proxy", None)
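+ # (Assumption: this guards against stale proxy settings that can break
+ # huggingface_hub downloads; popping is a no-op when the variables are unset.)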
+
+
+ def create_temp_dir():
+     """Create temporary directory for audio files if it doesn't exist"""
+     temp_dir = "temp_audio"
+     os.makedirs(temp_dir, exist_ok=True)
+     return temp_dir
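+ # Design note: a fixed "temp_audio" folder persists across runs; Python's
+ # tempfile.mkdtemp() would give per-run isolation if cleanup matters.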
+
+
+ class EnhancedAIAgent:
+     def __init__(self):
+         """Initialize the AI agent with models and pipelines"""
+         try:
+             logger.info("Initializing AI Agent...")
+
+             # Set device
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+             logger.info(f"Using device: {self.device}")
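+
+             # Note: recent transformers releases accept a device string
+             # ("cuda"/"cpu") for pipeline(); older ones expect an int index
+             # (0 for the first GPU, -1 for CPU).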
+
+             # Initialize speech recognition
+             logger.info("Loading speech recognition model...")
+             self.speech_recognizer = pipeline(
+                 "automatic-speech-recognition",
+                 model="facebook/wav2vec2-base-960h",
+                 device=self.device
+             )
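+             # Heads-up: this CTC model emits uppercase text without punctuation;
+             # the uncased sentiment model below tolerates that, but transcripts
+             # will read as all-caps.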
+
+             # Initialize sentiment analysis
+             logger.info("Loading sentiment analysis model...")
+             self.sentiment_analyzer = pipeline(
+                 "sentiment-analysis",
+                 model="distilbert-base-uncased-finetuned-sst-2-english",
+                 device=self.device
+             )
+
+             # Initialize emotion recognition
+             logger.info("Loading emotion recognition model...")
+             self.emotion_recognizer = pipeline(
+                 "text-classification",
+                 model="j-hartmann/emotion-english-distilroberta-base",
+                 device=self.device
+             )
+
+             # Create temporary directory for audio files
+             self.temp_dir = create_temp_dir()
+
+             logger.info("AI Agent initialized successfully")
+
+         except Exception as e:
+             logger.error(f"Error initializing AI Agent: {str(e)}")
+             raise
+
+     def process_audio(
+         self,
+         audio_path: Optional[str],
+         history: Optional[str]
+     ) -> Tuple[str, Optional[str], str, str]:
+         """Process audio input and generate response"""
+         try:
+             # Initialize history
+             history_list = [] if not history else history.split('\n')
+
+             # Handle no audio input
+             if not audio_path:
+                 return history or "", None, "", ""
+
+             # Convert speech to text
+             logger.info("Converting speech to text...")
+             user_input = self.speech_recognizer(audio_path)["text"]
+             logger.info(f"Transcribed text: {user_input}")
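+
+             # Assumption: recordings are short clips; for long audio, building
+             # the ASR pipeline with chunk_length_s (e.g. chunk_length_s=30)
+             # transcribes in windows and keeps memory bounded.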
+
+             # Generate simple response (can be enhanced later)
+             response = f"I heard you say: {user_input}"
+
+             # Analyze sentiment
+             logger.info("Analyzing sentiment...")
+             sentiment_result = self.sentiment_analyzer(user_input)[0]
+             sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"
+
+             # Analyze emotion
+             logger.info("Analyzing emotion...")
+             emotion_result = self.emotion_recognizer(user_input)[0]
+             emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"
+
+             # Generate audio response
+             logger.info("Generating audio response...")
+             audio_output = self.text_to_speech(response)
+
+             # Update history
+             history_list.extend([
+                 f"User: {user_input}",
+                 f"Assistant: {response}",
+                 sentiment,
+                 emotion
+             ])
+
+             return '\n'.join(history_list), audio_output, sentiment, emotion
+
+         except Exception as e:
+             logger.error(f"Error processing audio: {str(e)}")
+             return str(history or ""), None, f"Error: {str(e)}", f"Error: {str(e)}"
+
+     def text_to_speech(self, text: str) -> Optional[str]:
+         """Convert text to speech"""
+         try:
+             # gTTS produces MP3 data, so use a matching .mp3 extension.
+             # str hashes are salted per process, so these filenames are only
+             # unique within a single run.
+             output_path = os.path.join(self.temp_dir, f"response_{hash(text)}.mp3")
+             tts = gTTS(text=text, lang='en')
+             tts.save(output_path)
+             return output_path
+         except Exception as e:
+             logger.error(f"Error in text-to-speech conversion: {str(e)}")
+             return None
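+     # Note: gTTS calls Google's online TTS endpoint, so this method needs
+     # network access; offline it logs the error above and returns None.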
+
+
+ def create_interface():
+     """Create the Gradio interface"""
+     try:
+         # Initialize AI Agent
+         agent = EnhancedAIAgent()
+
+         # Define interface
+         with gr.Blocks() as interface:
+             gr.Markdown("# AI Speech Analysis App")
+
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     # Audio input
+                     audio_input = gr.Audio(
+                         label="Record your message",
+                         type="filepath"
+                     )
+
+                     # Analyze button
+                     analyze_button = gr.Button(
+                         "Analyze Speech",
+                         variant="primary"
+                     )
+
+                     # Chat history
+                     chat_history = gr.Textbox(
+                         label="Conversation History",
+                         lines=10,
+                         interactive=False
+                     )
+
+                     # Audio output
+                     audio_output = gr.Audio(
+                         label="AI Response",
+                         type="filepath"
+                     )
+
+                     # Analysis displays
+                     sentiment_display = gr.Textbox(
+                         label="Sentiment Analysis",
+                         interactive=False
+                     )
+
+                     emotion_display = gr.Textbox(
+                         label="Emotion Recognition",
+                         interactive=False
+                     )
+
+             # Set up event handler
+             analyze_button.click(
+                 fn=agent.process_audio,
+                 inputs=[audio_input, chat_history],
+                 outputs=[chat_history, audio_output, sentiment_display, emotion_display]
+             )
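+
+             # Design note: conversation state round-trips through the history
+             # Textbox; gr.State would be an alternative if the transcript box
+             # ever becomes display-only.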
+
+             # Instructions
+             gr.Markdown("""
+             ### How to Use:
+             1. Click the microphone icon to start recording
+             2. Speak your message
+             3. Click stop when finished
+             4. Press "Analyze Speech" to process your message
+             5. View the results and listen to the response
+             """)
+
+         return interface
+
+     except Exception as e:
+         logger.error(f"Error creating interface: {str(e)}")
+         raise
+
+
+ # Build the interface at import time (Spaces imports app.py and serves `demo`)
+ demo = create_interface()
+
+ # Launch the app when run directly
+ if __name__ == "__main__":
+     demo.launch()
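+     # (When running locally, demo.launch(share=True) is one way to get a
+     # temporary public URL; on Spaces the platform handles serving.)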