#!/usr/bin/env python3
import streamlit as st

st.set_page_config(page_title="Reddit Comment Analyzer", layout="wide")

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from nltk.corpus import stopwords
from collections import Counter
import praw
import nltk
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import pipeline

# Load transformer pipelines once at startup (the first run downloads the models)
sentiment_analyzer = pipeline('sentiment-analysis')
summarizer = pipeline('summarization')


def analyze_sentiment_transformer(comment):
    """Transformer-based sentiment label; long comments default to 'neutral'.

    Optional helper -- the app below uses VADER via analyze_sentiment().
    """
    if len(comment) <= 500:  # Only run the model on short comments
        return sentiment_analyzer(comment)[0]['label']
    else:
        return 'neutral'


def summarize_text(text):
    """Summarize short text; long text is returned truncated instead."""
    if len(text) <= 500:
        return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    else:
        return text[:500] + '...'  # Return truncated version


# Access Reddit credentials via st.secrets
reddit_client_id = st.secrets["client_id"]
reddit_client_secret = st.secrets["client_secret"]
reddit_user_agent = st.secrets["user_agent"]

# Initialize the Reddit API client (PRAW)
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent=reddit_user_agent
)
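# Configuration note (assumed layout -- adjust if your setup differs):
# st.secrets reads from .streamlit/secrets.toml, so the three keys above are
# expected to be defined there at the top level, e.g.
#
#   client_id = "<reddit app client id>"
#   client_secret = "<reddit app client secret>"
#   user_agent = "<descriptive user agent, e.g. comment-analyzer by u/yourname>"
#
# Launch the app with:  streamlit run reddit_comment_analyzer.py
# (the filename is illustrative -- use whatever this script is saved as)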
# VADER sentiment analyzer setup
analyzer = SentimentIntensityAnalyzer()

# Download stopwords
nltk.download('stopwords')

# Custom stopword list: largely NLTK's English stopwords plus informal extras
# such as 'dont', 'im', 'youre', 'know', 'like', 'get'
stop_words_set = ['at', 'how', 'do', 'm', 'during', 'again', 'been', 'dont', 'itself', 'from', 'in', 'myself',
                  "wouldn't", 'which', 'than', 'yourselves', 'her', 's', 'further', 'won', 'my', 'more', 'would',
                  'no', 'some', 'yours', "weren't", "haven't", 'over', 'couldn', 'against', "mustn't", 'same',
                  'was', 'himself', "aren't", 'through', 'shan', 'he', "mightn't", 'only', 'on', 't', 'ourselves',
                  'these', 'other', 'up', 'about', 'hers', 'hasn', 'it', "doesn't", 'for', 'wouldn', 'doing',
                  'not', 'his', 'll', 'you', "couldn't", 'too', 'haven', 'those', 'our', 'because', 'im', 'know',
                  'until', 'to', 'mightn', 'such', 'very', 'needn', 'they', 'or', 'as', 'having', 'isn', 'here',
                  'didn', "isn't", "i'm", 'most', 'did', 'have', "it's", "hadn't", 'by', 'has', 'into', 'there',
                  'yourself', 'had', 'am', 'y', 'just', 'don', 'are', 'does', 'like', 'whom', 'should', 'after',
                  'mustn', 'once', 'below', 'him', 'who', "you're", 'them', 'why', 'your', "you've", "you'll",
                  'is', "don't", 'aren', 'when', 'so', 'can', 'being', 'and', "should've", 'that', 'above',
                  "didn't", 'hadn', 'doesn', 've', 'ma', 'before', 'out', 'the', 'if', 'where', "shan't", 'under',
                  'each', 'ain', 'what', "shouldn't", 'down', 'now', 'weren', 'youre', 'a', 'with', "hasn't",
                  'herself', 'get', 're', "she's", 'of', 'we', "wasn't", 'their', 'theirs', 'but', 'o', "that'll",
                  'its', 'own', 'wasn', 'all', 'nor', "you'd", 'shouldn', 'both', 'me', 'd', 'between', 'be', 'an',
                  'any', 'i', 'she', 'this', 'then', "won't", 'were', 'will', "needn't", 'off', 'few',
                  'themselves', 'ours', 'while']

# Combine custom stopwords with NLTK stopwords
stop_words = list(set(stopwords.words('english')).union(stop_words_set))

# Sanity check: make sure TfidfVectorizer accepts the combined stop-word list
try:
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=5)
    print("Stop words applied:", vectorizer.get_stop_words())
    st.write("TfidfVectorizer initialized successfully!")
except Exception as e:
    st.error(f"Error initializing TfidfVectorizer: {e}")

# Streamlit app structure
st.title("Reddit Keyword-Based Comment Analyzer")


@st.cache  # Note: newer Streamlit versions replace st.cache with st.cache_data
def fetch_reddit_data(query, max_results=50, min_score=10):
    posts = reddit.subreddit('all').search(query, limit=max_results)
    comments, timestamps, scores = [], [], []
    for post in posts:
        post.comments.replace_more(limit=0)
        for comment in post.comments.list():
            if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
                if comment.score >= min_score:  # Filter comments by minimum score (upvotes)
                    comments.append(comment.body)
                    timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
                    scores.append(comment.score)  # Keep the comment score for reference
    return comments, timestamps, scores


def preprocess_text(text):
    # Lowercase, strip punctuation (so "don't" becomes "dont"), and collapse whitespace
    text = re.sub(r"[^\w\s]", '', text.lower())
    text = re.sub(r'\s+', ' ', text)
    return text


def analyze_sentiment(comments):
    # VADER compound score per comment, in [-1, 1]
    return [analyzer.polarity_scores(comment)['compound'] for comment in comments]


def generate_wordcloud(comments):
    filtered_words = ' '.join([word for word in ' '.join(comments).split() if word not in stop_words])
    return WordCloud(width=400, height=400, background_color='white').generate(filtered_words)


# Extract TF-IDF features for keywords
def extract_features(comments):
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
    X = vectorizer.fit_transform(comments)
    return vectorizer.get_feature_names_out(), X.sum(axis=0).A1


query = st.text_input("Enter a keyword to search for Reddit comments", value="data analyst bootcamp online course")
start_date = st.date_input("Start Date", value=pd.to_datetime("2024-01-01").date())
end_date = st.date_input("End Date", value=pd.to_datetime("today").date())

print("=================")

if st.button("Analyze"):
    comments, timestamps, scores = fetch_reddit_data(query, max_results=50, min_score=5)
    print(f'Fetched {len(comments)} comments.')
    if not comments:
        st.warning("No comments found for this search query.")
    else:
        # Clean the comments before passing them to TfidfVectorizer
        cleaned_comments = [preprocess_text(comment) for comment in comments]
        print("Sample of cleaned comments:")
        print(cleaned_comments[:5])
        sentiment_scores = analyze_sentiment(cleaned_comments)
        df = pd.DataFrame({
            'comment': cleaned_comments,
            'sentiment': sentiment_scores,
            'created_at': timestamps
        })
        # Ensure created_at is in datetime format and use it as the index for resampling
        df['created_at'] = pd.to_datetime(df['created_at'])
        df.set_index('created_at', inplace=True)
        # Filter by date range
        df = df[(df.index >= pd.Timestamp(start_date)) & (df.index <= pd.Timestamp(end_date))]
        df['sentiment_category'] = df['sentiment'].apply(
            lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

        # Save results in session state so they survive reruns
        st.session_state.df = df
        st.session_state.cleaned_comments = cleaned_comments

# If results are in session state, retrieve them
if 'df' in st.session_state:
    df = st.session_state.df
    cleaned_comments = st.session_state.cleaned_comments

    st.subheader("Key Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Comments", len(df))
    col2.metric("Positive", len(df[df['sentiment_category'] == 'positive']))
    col3.metric("Neutral", len(df[df['sentiment_category'] == 'neutral']))
    col4.metric("Negative", len(df[df['sentiment_category'] == 'negative']))

    # Sentiment Distribution
    st.subheader("Sentiment Distribution")
    sentiment_counts = df['sentiment_category'].value_counts()
    fig1, ax1 = plt.subplots()
    ax1.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
    ax1.axis('equal')

    # Keywords Affecting Sentiment
    st.subheader("Top Keywords Affecting Sentiment")
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
    X = vectorizer.fit_transform(df['comment'])
    features = vectorizer.get_feature_names_out()
    scores = np.asarray(X.mean(axis=0)).flatten()
    keywords_df = pd.DataFrame({'Keyword': features, 'Score': scores})
    top_keywords = keywords_df.sort_values(by='Score', ascending=False).head(10)
    fig2, ax2 = plt.subplots(figsize=(10, 5))
    top_keywords.plot(kind='bar', x='Keyword', y='Score', ax=ax2, color='darkorange')
    ax2.set_xlabel("Keyword")
    ax2.set_ylabel("TF-IDF Score")
    ax2.set_title("Top Keywords Affecting Sentiment")
    plt.xticks(rotation=45, ha='right')

    # Display the pie chart and keyword chart side by side
    col1, col2 = st.columns(2)
    with col1:
        st.pyplot(fig1)
    with col2:
        st.pyplot(fig2)

    # Word cloud and common-words bar chart side by side
    st.subheader("Word Cloud and Feature Importance Analysis")
    col1, col2 = st.columns(2)
    with col1:
        filtered_words = ' '.join(
            [word for word in ' '.join(cleaned_comments).split() if word.lower() not in stop_words])
        wordcloud = WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
        fig_wc, ax_wc = plt.subplots(figsize=(5, 4))
        ax_wc.imshow(wordcloud, interpolation='bilinear')
        ax_wc.axis('off')
        st.pyplot(fig_wc)
    with col2:
        # Feature Importance Analysis
        feature_names, feature_counts = extract_features(cleaned_comments)
        feature_df = pd.DataFrame({'Feature': feature_names, 'Count': feature_counts})
        feature_df = feature_df.sort_values(by='Count', ascending=False).head(10)
        fig3, ax3 = plt.subplots(figsize=(10, 5))
        feature_df.plot(kind='bar', x='Feature', y='Count', ax=ax3, color='salmon')
        ax3.set_xlabel("Feature")
        ax3.set_ylabel("Frequency")
        ax3.set_title("Top Keywords Impacting Sentiment")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig3)

    st.subheader("Sentiment Over Time")
    sentiment_over_time = df['sentiment'].resample('W').mean()  # Resample by week
    fig6, ax6 = plt.subplots(figsize=(10, 5))
    ax6.plot(sentiment_over_time.index, sentiment_over_time.values, marker='o')
    ax6.set_xlabel("Date")
    ax6.set_ylabel("Average Sentiment")
    st.pyplot(fig6)

    # Sentiment Distribution by Hour of Day
    st.subheader("Sentiment Distribution by Hour of Day")
    df['hour'] = df.index.hour
    sentiment_by_hour = df.groupby('hour')['sentiment'].mean()
    fig5, ax5 = plt.subplots(figsize=(10, 5))
    ax5.bar(sentiment_by_hour.index, sentiment_by_hour.values, color='skyblue')
    ax5.set_xlabel("Hour of Day")
    ax5.set_ylabel("Average Sentiment")
    ax5.set_title("Average Sentiment by Hour of Day")
    st.pyplot(fig5)

    # Sentiment Heatmap
    st.subheader("Sentiment Heatmap by Hour and Day")
    df['day_of_week'] = df.index.day_name()
    df['hour'] = df.index.hour
    heatmap_data = df.groupby(['day_of_week', 'hour'])['sentiment'].mean().unstack()
    fig4, ax4 = plt.subplots(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.2f', ax=ax4)
    ax4.set_xlabel("Hour of Day")
    ax4.set_ylabel("Day of Week")
    ax4.set_title("Sentiment Heatmap by Hour and Day")
    st.pyplot(fig4)

    # Filter comments by sentiment
    st.subheader("Filter Comments by Sentiment")
    sentiment_option = st.selectbox("Choose Sentiment", ['positive', 'neutral', 'negative'])
    filtered_comments = df[df['sentiment_category'] == sentiment_option]
    if not filtered_comments.empty:
        st.write(filtered_comments[['comment', 'sentiment_category']].head())
    else:
        st.write("No comments found for the selected sentiment.")

    # Display raw data
    st.subheader("Raw Data")
    st.write(df.head())

    # Top Sentiment-Related Keywords
    st.subheader("Top Sentiment-Related Keywords")
    keywords = ['excellent', 'good', 'great', 'bad', 'terrible']
    filtered_comments_with_keywords = [
        comment for comment in cleaned_comments if any(keyword in comment for keyword in keywords)]
    if filtered_comments_with_keywords:
        st.write(f"Found {len(filtered_comments_with_keywords)} comments containing sentiment-related keywords.")
        for i, comment in enumerate(filtered_comments_with_keywords[:10]):
            st.write(f"**Comment {i+1}:** {comment}")
    else:
        st.write("No comments with sentiment-related keywords found.")
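    # Optional: per-comment summaries using the summarize_text() helper defined
    # above. Disabled by default because the summarization model is slow when
    # many comments are fetched; uncomment to enable (the [:10] cap is a
    # suggested limit, not required).
    # st.subheader("Comment Summaries")
    # for comment in cleaned_comments[:10]:
    #     st.write(summarize_text(comment))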
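# Rough dependency list, inferred from the imports above (package names only,
# versions unpinned -- adjust as needed):
#   streamlit, praw, vaderSentiment, wordcloud, nltk, scikit-learn,
#   transformers (plus a backend such as torch), pandas, numpy,
#   matplotlib, seaborn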