#!/usr/bin/env python3
import streamlit as st

st.set_page_config(page_title="Reddit Comment Analyzer", layout="wide")

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from nltk.corpus import stopwords
from collections import Counter
import praw
import nltk
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import pipeline

# Load transformer pipelines once at startup (the first run downloads the models)
sentiment_analyzer = pipeline('sentiment-analysis')
summarizer = pipeline('summarization')


def analyze_sentiment_transformer(comment):
    """Transformer-based sentiment label; long comments default to 'neutral'.

    Optional helper -- the app below uses VADER via analyze_sentiment().
    """
    if len(comment) <= 500:  # Only run the model on short comments
        return sentiment_analyzer(comment)[0]['label']
    else:
        return 'neutral'


def summarize_text(text):
    """Summarize short text; long text is returned truncated instead."""
    if len(text) <= 500:
        return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    else:
        return text[:500] + '...'  # Return truncated version


# Access Reddit credentials via st.secrets
reddit_client_id = st.secrets["client_id"]
reddit_client_secret = st.secrets["client_secret"]
reddit_user_agent = st.secrets["user_agent"]

# Initialize the Reddit API client (PRAW)
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent=reddit_user_agent
)
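# Configuration note (assumed layout -- adjust if your setup differs):
# st.secrets reads from .streamlit/secrets.toml, so the three keys above are
# expected to be defined there at the top level, e.g.
#
#   client_id = "<reddit app client id>"
#   client_secret = "<reddit app client secret>"
#   user_agent = "<descriptive user agent, e.g. comment-analyzer by u/yourname>"
#
# Launch the app with:  streamlit run reddit_comment_analyzer.py
# (the filename is illustrative -- use whatever this script is saved as)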
# VADER sentiment analyzer setup
analyzer = SentimentIntensityAnalyzer()

# Download stopwords
nltk.download('stopwords')

# Custom stopword list: largely NLTK's English stopwords plus informal extras
# such as 'dont', 'im', 'youre', 'know', 'like', 'get'
stop_words_set = ['at', 'how', 'do', 'm', 'during', 'again', 'been', 'dont', 'itself', 'from', 'in', 'myself',
                  "wouldn't", 'which', 'than', 'yourselves', 'her', 's', 'further', 'won', 'my', 'more', 'would',
                  'no', 'some', 'yours', "weren't", "haven't", 'over', 'couldn', 'against', "mustn't", 'same',
                  'was', 'himself', "aren't", 'through', 'shan', 'he', "mightn't", 'only', 'on', 't', 'ourselves',
                  'these', 'other', 'up', 'about', 'hers', 'hasn', 'it', "doesn't", 'for', 'wouldn', 'doing',
                  'not', 'his', 'll', 'you', "couldn't", 'too', 'haven', 'those', 'our', 'because', 'im', 'know',
                  'until', 'to', 'mightn', 'such', 'very', 'needn', 'they', 'or', 'as', 'having', 'isn', 'here',
                  'didn', "isn't", "i'm", 'most', 'did', 'have', "it's", "hadn't", 'by', 'has', 'into', 'there',
                  'yourself', 'had', 'am', 'y', 'just', 'don', 'are', 'does', 'like', 'whom', 'should', 'after',
                  'mustn', 'once', 'below', 'him', 'who', "you're", 'them', 'why', 'your', "you've", "you'll",
                  'is', "don't", 'aren', 'when', 'so', 'can', 'being', 'and', "should've", 'that', 'above',
                  "didn't", 'hadn', 'doesn', 've', 'ma', 'before', 'out', 'the', 'if', 'where', "shan't", 'under',
                  'each', 'ain', 'what', "shouldn't", 'down', 'now', 'weren', 'youre', 'a', 'with', "hasn't",
                  'herself', 'get', 're', "she's", 'of', 'we', "wasn't", 'their', 'theirs', 'but', 'o', "that'll",
                  'its', 'own', 'wasn', 'all', 'nor', "you'd", 'shouldn', 'both', 'me', 'd', 'between', 'be', 'an',
                  'any', 'i', 'she', 'this', 'then', "won't", 'were', 'will', "needn't", 'off', 'few',
                  'themselves', 'ours', 'while']

# Combine custom stopwords with NLTK stopwords
stop_words = list(set(stopwords.words('english')).union(stop_words_set))

# Sanity check: make sure TfidfVectorizer accepts the combined stop-word list
try:
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=5)
    print("Stop words applied:", vectorizer.get_stop_words())
    st.write("TfidfVectorizer initialized successfully!")
except Exception as e:
    st.error(f"Error initializing TfidfVectorizer: {e}")

# Streamlit app structure
st.title("Reddit Keyword-Based Comment Analyzer")


@st.cache  # Note: newer Streamlit versions replace st.cache with st.cache_data
def fetch_reddit_data(query, max_results=50, min_score=10):
    posts = reddit.subreddit('all').search(query, limit=max_results)
    comments, timestamps, scores = [], [], []
    for post in posts:
        post.comments.replace_more(limit=0)
        for comment in post.comments.list():
            if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
                if comment.score >= min_score:  # Filter comments by minimum score (upvotes)
                    comments.append(comment.body)
                    timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
                    scores.append(comment.score)  # Keep the comment score for reference
    return comments, timestamps, scores


def preprocess_text(text):
    # Lowercase, strip punctuation (so "don't" becomes "dont"), and collapse whitespace
    text = re.sub(r"[^\w\s]", '', text.lower())
    text = re.sub(r'\s+', ' ', text)
    return text


def analyze_sentiment(comments):
    # VADER compound score per comment, in [-1, 1]
    return [analyzer.polarity_scores(comment)['compound'] for comment in comments]


def generate_wordcloud(comments):
    filtered_words = ' '.join([word for word in ' '.join(comments).split() if word not in stop_words])
    return WordCloud(width=400, height=400, background_color='white').generate(filtered_words)


# Extract TF-IDF features for keywords
def extract_features(comments):
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
    X = vectorizer.fit_transform(comments)
    return vectorizer.get_feature_names_out(), X.sum(axis=0).A1


query = st.text_input("Enter a keyword to search for Reddit comments", value="data analyst bootcamp online course")
start_date = st.date_input("Start Date", value=pd.to_datetime("2024-01-01").date())
end_date = st.date_input("End Date", value=pd.to_datetime("today").date())

print("=================")

if st.button("Analyze"):
    comments, timestamps, scores = fetch_reddit_data(query, max_results=50, min_score=5)
    print(f'Fetched {len(comments)} comments.')
    if not comments:
        st.warning("No comments found for this search query.")
    else:
        # Clean the comments before passing them to TfidfVectorizer
        cleaned_comments = [preprocess_text(comment) for comment in comments]
        print("Sample of cleaned comments:")
        print(cleaned_comments[:5])
        sentiment_scores = analyze_sentiment(cleaned_comments)
        df = pd.DataFrame({
            'comment': cleaned_comments,
            'sentiment': sentiment_scores,
            'created_at': timestamps
        })
        # Ensure created_at is in datetime format and use it as the index for resampling
        df['created_at'] = pd.to_datetime(df['created_at'])
        df.set_index('created_at', inplace=True)
        # Filter by date range
        df = df[(df.index >= pd.Timestamp(start_date)) & (df.index <= pd.Timestamp(end_date))]
        df['sentiment_category'] = df['sentiment'].apply(
            lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

        # Save results in session state so they survive reruns
        st.session_state.df = df
        st.session_state.cleaned_comments = cleaned_comments

# If results are in session state, retrieve them
if 'df' in st.session_state:
    df = st.session_state.df
    cleaned_comments = st.session_state.cleaned_comments

    st.subheader("Key Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Comments", len(df))
    col2.metric("Positive", len(df[df['sentiment_category'] == 'positive']))
    col3.metric("Neutral", len(df[df['sentiment_category'] == 'neutral']))
    col4.metric("Negative", len(df[df['sentiment_category'] == 'negative']))

    # Sentiment Distribution
    st.subheader("Sentiment Distribution")
    sentiment_counts = df['sentiment_category'].value_counts()
    fig1, ax1 = plt.subplots()
    ax1.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
    ax1.axis('equal')

    # Keywords Affecting Sentiment
    st.subheader("Top Keywords Affecting Sentiment")
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
    X = vectorizer.fit_transform(df['comment'])
    features = vectorizer.get_feature_names_out()
    scores = np.asarray(X.mean(axis=0)).flatten()
    keywords_df = pd.DataFrame({'Keyword': features, 'Score': scores})
    top_keywords = keywords_df.sort_values(by='Score', ascending=False).head(10)
    fig2, ax2 = plt.subplots(figsize=(10, 5))
    top_keywords.plot(kind='bar', x='Keyword', y='Score', ax=ax2, color='darkorange')
    ax2.set_xlabel("Keyword")
    ax2.set_ylabel("TF-IDF Score")
    ax2.set_title("Top Keywords Affecting Sentiment")
    plt.xticks(rotation=45, ha='right')

    # Display the pie chart and keyword chart side by side
    col1, col2 = st.columns(2)
    with col1:
        st.pyplot(fig1)
    with col2:
        st.pyplot(fig2)

    # Word cloud and common-words bar chart side by side
    st.subheader("Word Cloud and Feature Importance Analysis")
    col1, col2 = st.columns(2)
    with col1:
        filtered_words = ' '.join(
            [word for word in ' '.join(cleaned_comments).split() if word.lower() not in stop_words])
        wordcloud = WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
        fig_wc, ax_wc = plt.subplots(figsize=(5, 4))
        ax_wc.imshow(wordcloud, interpolation='bilinear')
        ax_wc.axis('off')
        st.pyplot(fig_wc)
    with col2:
        # Feature Importance Analysis
        feature_names, feature_counts = extract_features(cleaned_comments)
        feature_df = pd.DataFrame({'Feature': feature_names, 'Count': feature_counts})
        feature_df = feature_df.sort_values(by='Count', ascending=False).head(10)
        fig3, ax3 = plt.subplots(figsize=(10, 5))
        feature_df.plot(kind='bar', x='Feature', y='Count', ax=ax3, color='salmon')
        ax3.set_xlabel("Feature")
        ax3.set_ylabel("Frequency")
        ax3.set_title("Top Keywords Impacting Sentiment")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig3)

    st.subheader("Sentiment Over Time")
    sentiment_over_time = df['sentiment'].resample('W').mean()  # Resample by week
    fig6, ax6 = plt.subplots(figsize=(10, 5))
    ax6.plot(sentiment_over_time.index, sentiment_over_time.values, marker='o')
    ax6.set_xlabel("Date")
    ax6.set_ylabel("Average Sentiment")
    st.pyplot(fig6)

    # Sentiment Distribution by Hour of Day
    st.subheader("Sentiment Distribution by Hour of Day")
    df['hour'] = df.index.hour
    sentiment_by_hour = df.groupby('hour')['sentiment'].mean()
    fig5, ax5 = plt.subplots(figsize=(10, 5))
    ax5.bar(sentiment_by_hour.index, sentiment_by_hour.values, color='skyblue')
    ax5.set_xlabel("Hour of Day")
    ax5.set_ylabel("Average Sentiment")
    ax5.set_title("Average Sentiment by Hour of Day")
    st.pyplot(fig5)

    # Sentiment Heatmap
    st.subheader("Sentiment Heatmap by Hour and Day")
    df['day_of_week'] = df.index.day_name()
    df['hour'] = df.index.hour
    heatmap_data = df.groupby(['day_of_week', 'hour'])['sentiment'].mean().unstack()
    fig4, ax4 = plt.subplots(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.2f', ax=ax4)
    ax4.set_xlabel("Hour of Day")
    ax4.set_ylabel("Day of Week")
    ax4.set_title("Sentiment Heatmap by Hour and Day")
    st.pyplot(fig4)

    # Filter comments by sentiment
    st.subheader("Filter Comments by Sentiment")
    sentiment_option = st.selectbox("Choose Sentiment", ['positive', 'neutral', 'negative'])
    filtered_comments = df[df['sentiment_category'] == sentiment_option]
    if not filtered_comments.empty:
        st.write(filtered_comments[['comment', 'sentiment_category']].head())
    else:
        st.write("No comments found for the selected sentiment.")

    # Display raw data
    st.subheader("Raw Data")
    st.write(df.head())

    # Top Sentiment-Related Keywords
    st.subheader("Top Sentiment-Related Keywords")
    keywords = ['excellent', 'good', 'great', 'bad', 'terrible']
    filtered_comments_with_keywords = [
        comment for comment in cleaned_comments if any(keyword in comment for keyword in keywords)]
    if filtered_comments_with_keywords:
        st.write(f"Found {len(filtered_comments_with_keywords)} comments containing sentiment-related keywords.")
        for i, comment in enumerate(filtered_comments_with_keywords[:10]):
            st.write(f"**Comment {i+1}:** {comment}")
    else:
        st.write("No comments with sentiment-related keywords found.")
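    # Optional: per-comment summaries using the summarize_text() helper defined
    # above. Disabled by default because the summarization model is slow when
    # many comments are fetched; uncomment to enable (the [:10] cap is a
    # suggested limit, not required).
    # st.subheader("Comment Summaries")
    # for comment in cleaned_comments[:10]:
    #     st.write(summarize_text(comment))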
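# Rough dependency list, inferred from the imports above (package names only,
# versions unpinned -- adjust as needed):
#   streamlit, praw, vaderSentiment, wordcloud, nltk, scikit-learn,
#   transformers (plus a backend such as torch), pandas, numpy,
#   matplotlib, seaborn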