# imBesideYou / transcrip_score.py
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import google.generativeai as genai
import os

# Configure the Gemini API; read the key from the environment instead of hardcoding it.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
def load_and_preprocess_data(file):
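    """Load a transcript-score CSV, coerce the feature columns to numeric, and drop rows that fail conversion."""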
data = pd.read_csv(file)
numeric_columns = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']
for col in numeric_columns:
data[col] = pd.to_numeric(data[col], errors='coerce')
    # Drop only rows where a feature value could not be parsed
    data = data.dropna(subset=numeric_columns)
return data
def calculate_transcript_stats(data, feature):
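    """Return the mean, median, and standard deviation of a single feature column."""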
return {
'mean': data[feature].mean(),
'median': data[feature].median(),
'std': data[feature].std()
}
def calculate_feature_percentages(data):
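    """Return the share (in %) of positive, negative, and neutral sentiment across all rows."""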
feature_columns = ['positive', 'negative', 'neutral']
total = data[feature_columns].sum().sum()
percentages = (data[feature_columns].sum() / total * 100).round(2)
return percentages
def format_transcript_prompt(features, data_dict, selected_candidates, feature_percentages=None):
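    """Build the analysis prompt sent to Gemini, embedding per-candidate statistics (or sentiment percentages) for the selected features."""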
prompt = """
You are an AI assistant specializing in transcript analysis. You have access to transcript data for the following candidates: {CANDIDATES}, focusing on these features: {FEATURES}.
{STATS}
Based on this data:
1. Compare the overall levels of the specified features across the selected candidates. Which candidates exhibit more positive/negative sentiment or higher confidence?
2. Analyze the distribution of features for each selected candidate. Are they evenly spread or concentrated at certain levels?
3. Discuss any significant differences in speech patterns across the selected candidates. What might these differences suggest about their communication styles?
4. Consider the variability of features for each selected candidate. Do some candidates have more consistent levels, or do they fluctuate more?
5. Based on this transcript data, hypothesize about potential speaking styles or topics that might contribute to the observed patterns.
6. How might the differences in speech patterns between these selected candidates affect the overall listener experience?
Provide a detailed analysis addressing these points, using specific data references where relevant. Your analysis should offer insights into how these features characterize each selected candidate's communication style and what this reveals about their potential impact on the audience.
"""
stats = ""
all_features = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']
for candidate in selected_candidates:
stats += f"\n{candidate}:\n"
        if feature_percentages is not None and isinstance(feature_percentages, pd.Series):
            # Recompute the sentiment percentages for this candidate so the combined
            # figures are not repeated under every candidate's name.
            candidate_percentages = calculate_feature_percentages(data_dict[candidate])
            for feature in candidate_percentages.index:
                stats += f"{feature.capitalize()}: {candidate_percentages[feature]:.2f}%\n"
else:
features_to_analyze = all_features if 'all' in features else features
for feature in features_to_analyze:
feature_stats = calculate_transcript_stats(data_dict[candidate], feature)
stats += f"{feature.capitalize()} - Mean: {feature_stats['mean']:.2f}, Median: {feature_stats['median']:.2f}, Standard Deviation: {feature_stats['std']:.2f}\n"
features_display = "all features" if 'all' in features else ", ".join(features)
return prompt.format(CANDIDATES=", ".join(selected_candidates), FEATURES=features_display, STATS=stats)
def generate_response(prompt, data_dict, features, selected_candidates, feature_percentages=None):
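    """Prepend the transcript-analysis context to the user's query, call Gemini, and return the reply text (or an error message)."""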
model = genai.GenerativeModel('gemini-pro')
analysis_prompt = format_transcript_prompt(features, data_dict, selected_candidates, feature_percentages)
full_prompt = analysis_prompt + "\n\nUser query: " + prompt
    try:
        response = model.generate_content(full_prompt)
    except Exception as e:
        return f"Error calling the Gemini API: {e}"
if hasattr(response, 'candidates'):
if response.candidates:
content = response.candidates[0].content
if hasattr(content, 'parts'):
for part in content.parts:
if hasattr(part, 'text'):
return part.text
return "Error: Unable to extract text from the response. Please check the API response structure."
def visualize_all_features(data_dict, selected_candidates):
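    """Render the correlation matrix, sentiment distributions, speech-speed timeline, and
    speed-vs-confidence scatter for the selected candidates, and return their overall
    sentiment percentages."""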
feature_columns = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic',
'speech_speed']
# Correlation Matrix
fig, ax = plt.subplots(figsize=(12, 10))
combined_data = pd.concat([data_dict[candidate][feature_columns] for candidate in selected_candidates],
keys=selected_candidates)
correlation = combined_data.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Transcript Score Data')
st.pyplot(fig)
# Sentiment Distributions
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for i, sentiment in enumerate(['positive', 'negative', 'neutral']):
for candidate in selected_candidates:
            sns.kdeplot(data_dict[candidate][sentiment], ax=axes[i], label=candidate, fill=True)
axes[i].set_title(f'Distribution of {sentiment.capitalize()} Sentiment')
axes[i].legend()
st.pyplot(fig)
# Speech Speed Over Time
fig, ax = plt.subplots(figsize=(12, 6))
for candidate in selected_candidates:
ax.plot(data_dict[candidate]['start'], data_dict[candidate]['speech_speed'], label=candidate)
ax.set_title('Speech Speed Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Speech Speed')
ax.legend()
st.pyplot(fig)
# Speech Speed vs Confidence
fig, ax = plt.subplots(figsize=(10, 6))
for candidate in selected_candidates:
sns.scatterplot(data=data_dict[candidate], x='speech_speed', y='confident', label=candidate, ax=ax)
ax.set_title('Speech Speed vs Confidence')
ax.legend()
st.pyplot(fig)
    # Percentages should reflect only the candidates currently being compared
    return calculate_feature_percentages(pd.concat([data_dict[candidate] for candidate in selected_candidates]))
def visualize_single_candidate(data, features):
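    """Plot the full dashboard (correlations, sentiment histograms, speech speed) for one candidate,
    or KDE distributions of the requested features; returns sentiment percentages when all
    features are shown, otherwise None."""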
if 'all' in features:
# Correlation Matrix
fig, ax = plt.subplots(figsize=(12, 10))
correlation = data[['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Transcript Score Data')
st.pyplot(fig)
# Sentiment Distributions
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
sns.histplot(data['positive'], kde=True, ax=axes[0])
axes[0].set_title('Distribution of Positive Sentiment')
sns.histplot(data['negative'], kde=True, ax=axes[1])
axes[1].set_title('Distribution of Negative Sentiment')
sns.histplot(data['neutral'], kde=True, ax=axes[2])
axes[2].set_title('Distribution of Neutral Sentiment')
st.pyplot(fig)
# Speech Speed Over Time
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data['start'], data['speech_speed'])
ax.set_title('Speech Speed Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Speech Speed')
st.pyplot(fig)
# Speech Speed vs Confidence
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='speech_speed', y='confident')
ax.set_title('Speech Speed vs Confidence')
st.pyplot(fig)
        # Return a Series so the isinstance(..., pd.Series) check in format_transcript_prompt works
        return calculate_feature_percentages(data)
else:
# Distribution plot for specific features
fig, ax = plt.subplots(figsize=(10, 6))
for feature in features:
            sns.kdeplot(data[feature], ax=ax, label=feature, fill=True)
        ax.set_title("Distribution of Selected Features")
ax.set_xlabel("Value")
ax.set_ylabel("Density")
ax.legend()
st.pyplot(fig)
return None
def visualize_comparison(data_dict, features, selected_candidates):
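    """Dispatch to the full multi-candidate dashboard when all features are requested,
    otherwise plot per-feature distributions; returns sentiment percentages or None."""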
    if 'all' in features or set(features) == {'positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed'}:
return visualize_all_features(data_dict, selected_candidates)
else:
visualize_specific_features(data_dict, features, selected_candidates)
return None
def visualize_specific_features(data_dict, features, selected_candidates):
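    """Overlay KDE distributions of each requested feature for the selected candidates."""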
for feature in features:
fig, ax = plt.subplots(figsize=(10, 6))
for candidate in selected_candidates:
            sns.kdeplot(data_dict[candidate][feature], ax=ax, label=candidate, fill=True)
ax.set_title(f"Distribution of '{feature}'")
ax.set_xlabel("Value")
ax.set_ylabel("Density")
ax.legend()
st.pyplot(fig)
def main():
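    """Streamlit entry point: collect per-candidate CSV uploads, then drive a chat loop that
    visualizes the requested features and asks Gemini for a comparative analysis."""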
st.title("Multi-Candidate Transcript Analysis Chat Interface")
num_candidates = st.number_input("How many candidates would you like to compare?", min_value=1, max_value=10,
value=1)
data_dict = {}
for i in range(num_candidates):
uploaded_file = st.file_uploader(f"Choose CSV file for Candidate {i + 1}", type="csv", key=f"candidate_{i + 1}")
if uploaded_file is not None:
data = load_and_preprocess_data(uploaded_file)
data_dict[f"Candidate {i + 1}"] = data
if len(data_dict) == num_candidates:
st.success("All files uploaded successfully. You can now start chatting!")
st.subheader("Data Information")
for candidate, data in data_dict.items():
st.write(f"{candidate} Columns:", data.columns.tolist())
features = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic',
'speech_speed']
st.write("Available Features:", features)
if "messages" not in st.session_state:
st.session_state.messages = []
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
# Candidate selection for each query
selected_candidates = st.multiselect("Select candidates to compare:", list(data_dict.keys()))
if not selected_candidates:
selected_candidates = list(data_dict.keys())
if prompt := st.chat_input("What would you like to know about the candidates' transcripts?"):
st.chat_message("user").markdown(prompt)
st.session_state.messages.append({"role": "user", "content": prompt})
selected_features = []
if any(keyword in prompt.lower() for keyword in ["all features", "all transcript", "compare all"]):
selected_features = ['all']
else:
for feature in features:
if feature in prompt.lower():
selected_features.append(feature)
if not selected_features:
selected_features = ['all']
feature_percentages = None
with st.chat_message("assistant"):
feature_percentages = visualize_comparison(data_dict, selected_features, selected_candidates)
response = generate_response(prompt, data_dict, selected_features, selected_candidates, feature_percentages)
with st.chat_message("assistant"):
st.markdown(response)
st.session_state.messages.append({"role": "assistant", "content": response})
if st.checkbox("Show raw data"):
for candidate in selected_candidates:
st.subheader(f"{candidate} Data")
st.write(data_dict[candidate])
if __name__ == "__main__":
main()