import os
import logging
from typing import Optional, Tuple

import gradio as gr
import torch
from gtts import gTTS
from transformers import pipeline

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
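
# Clear proxy settings from the environment; stale proxies can break
# model downloads from the Hugging Face Hub.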
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
os.environ.pop("http_proxy", None)
os.environ.pop("https_proxy", None)


def create_temp_dir():
    """Create temporary directory for audio files if it doesn't exist."""
    temp_dir = "temp_audio"
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
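
# An alternative sketch using the standard library, if you would rather keep
# generated audio out of the working directory (tempfile.mkdtemp is the real
# stdlib call; the prefix is illustrative):
#
#   import tempfile
#   temp_dir = tempfile.mkdtemp(prefix="temp_audio_")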


class EnhancedAIAgent:
    def __init__(self):
        """Initialize the AI agent with models and pipelines."""
        try:
            logger.info("Initializing AI Agent...")

            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")
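
            # Note: recent transformers releases accept a device string
            # ("cuda"/"cpu") for pipeline(); older releases expect an integer
            # index instead, e.g. device=0 if torch.cuda.is_available() else -1.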
            logger.info("Loading speech recognition model...")
            self.speech_recognizer = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-base-960h",
                device=self.device
            )
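
            # wav2vec2-base-960h emits uppercase transcripts without
            # punctuation; keep that in mind when reading the text fields
            # downstream.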

            logger.info("Loading sentiment analysis model...")
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=self.device
            )

            logger.info("Loading emotion recognition model...")
            self.emotion_recognizer = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                device=self.device
            )
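
            # Per its model card, this emotion model predicts one of seven
            # labels: anger, disgust, fear, joy, neutral, sadness, surprise.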

            self.temp_dir = create_temp_dir()

            logger.info("AI Agent initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing AI Agent: {str(e)}")
            raise

    def process_audio(
        self,
        audio_path: Optional[str],
        history: Optional[str]
    ) -> Tuple[str, Optional[str], str, str]:
        """Process audio input and generate response."""
        try:
            history_list = [] if not history else history.split('\n')

            if not audio_path:
                return history or "", None, "", ""

            logger.info("Converting speech to text...")
            user_input = self.speech_recognizer(audio_path)["text"]
            logger.info(f"Transcribed text: {user_input}")

            response = f"I heard you say: {user_input}"

            logger.info("Analyzing sentiment...")
            sentiment_result = self.sentiment_analyzer(user_input)[0]
            sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"

            logger.info("Analyzing emotion...")
            emotion_result = self.emotion_recognizer(user_input)[0]
            emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"

            logger.info("Generating audio response...")
            audio_output = self.text_to_speech(response)

            history_list.extend([
                f"User: {user_input}",
                f"Assistant: {response}",
                sentiment,
                emotion
            ])

            return '\n'.join(history_list), audio_output, sentiment, emotion

        except Exception as e:
            logger.error(f"Error processing audio: {str(e)}")
            return history or "", None, f"Error: {str(e)}", f"Error: {str(e)}"

    def text_to_speech(self, text: str) -> Optional[str]:
        """Convert text to speech."""
        try:
            # gTTS synthesizes via Google's TTS web service and always writes
            # MP3, so the output file needs an .mp3 extension (not .wav).
            output_path = os.path.join(self.temp_dir, f"response_{hash(text)}.mp3")
            tts = gTTS(text=text, lang='en')
            tts.save(output_path)
            return output_path
        except Exception as e:
            logger.error(f"Error in text-to-speech conversion: {str(e)}")
            return None
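

# A minimal smoke test for the agent, assuming a local recording at
# "sample.wav" (hypothetical path) and network access for gTTS:
#
#   agent = EnhancedAIAgent()
#   history, audio_path, sentiment, emotion = agent.process_audio("sample.wav", None)
#   print(sentiment)
#   print(emotion)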


def create_interface():
    """Create the Gradio interface."""
    try:
        agent = EnhancedAIAgent()

        with gr.Blocks() as interface:
            gr.Markdown("# AI Speech Analysis App")

            with gr.Row():
                with gr.Column(scale=2):
                    audio_input = gr.Audio(
                        label="Record your message",
                        type="filepath"
                    )

                    analyze_button = gr.Button(
                        "Analyze Speech",
                        variant="primary"
                    )

                    chat_history = gr.Textbox(
                        label="Conversation History",
                        lines=10,
                        interactive=False
                    )

                    audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath"
                    )

                    sentiment_display = gr.Textbox(
                        label="Sentiment Analysis",
                        interactive=False
                    )

                    emotion_display = gr.Textbox(
                        label="Emotion Recognition",
                        interactive=False
                    )

            analyze_button.click(
                fn=agent.process_audio,
                inputs=[audio_input, chat_history],
                outputs=[chat_history, audio_output, sentiment_display, emotion_display]
            )

            gr.Markdown("""
            ### How to Use:
            1. Click the microphone icon to start recording
            2. Speak your message
            3. Click stop when finished
            4. Press "Analyze Speech" to process your message
            5. View the results and listen to the response
            """)

        return interface

    except Exception as e:
        logger.error(f"Error creating interface: {str(e)}")
        raise


demo = create_interface()
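
# For container or remote deployment, Gradio's standard launch options apply,
# e.g. demo.launch(server_name="0.0.0.0", server_port=7860).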
if __name__ == "__main__":
    demo.launch()