Bayhaqy committed
Commit 65d3732 • 1 Parent(s): 82508ca

Create πŸ“±_X_Scrapping.py

Files changed (1)
  1. pages/πŸ“±_X_Scrapping.py +534 -0
pages/πŸ“±_X_Scrapping.py ADDED
@@ -0,0 +1,534 @@
+ # Data Analysis and Profiling
+ import pandas as pd
+ from ydata_profiling import ProfileReport
+ from streamlit_pandas_profiling import st_profile_report
+
+ # Streamlit for Building the Dashboard
+ import streamlit as st
+ import streamlit_pandas_profiling
+
+ # Language Detection
+ from langdetect import detect
+
+ # NLP and Text Processing
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from deep_translator import GoogleTranslator
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from bs4 import BeautifulSoup
+
+ # Sentiment Analysis
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ from textblob import TextBlob
+
+ # URL Parsing
+ from urllib.parse import urlparse
+
+ # Data Visualization
+ import plotly.express as px
+ import matplotlib.pyplot as plt
+
+ # Word Cloud Generation
+ from wordcloud import WordCloud
+
+ # Other Libraries
+ import torch
+ import requests
+ import subprocess
+ import logging
+ import re
+ import os
+
+ # NLTK Data Download ('stopwords' is required by the preprocessing step below)
+ nltk.download('wordnet')
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ ## ............................................... ##
+ # Set page configuration (call this once and make changes as needed)
+ st.set_page_config(page_title='(Tweet) X Scrapper Dashboard', layout='wide', page_icon=':rocket:')
+
+
+ ## ............................................... ##
+ with st.container():
+     # Define Streamlit app title and introduction
+     st.title("(Tweet) X Scrapper Dashboard")
+     st.write("Created by Bayhaqy")
+
+     # Sidebar content
+     st.sidebar.subheader("About the app")
+     st.sidebar.info("This app lets you collect tweets and run analysis and predictions with the (Tweet) X Scrapper tool.")
+
+     url = "https://blogs.bayhaqy.my.id/2023/10/auth-token-twitter.html"
+     st.sidebar.markdown("Check this [link](%s) for a guide on how to get your own X Auth Token" % url)
+
+     st.sidebar.write("\n\n")
+     st.sidebar.markdown("**Please contact me if you have any questions**")
+     st.sidebar.write("\n\n")
+     st.sidebar.divider()
+     st.sidebar.markdown("© 2023 (Tweet) X Scrapper Dashboard")
+
+ ## ............................................... ##
+ # Function to install Node.js (tweet-harvest needs Node.js 20 or newer)
+ @st.cache_data
+ def install_nodejs():
+     node_major_version = int(subprocess.check_output(['node', '-v']).decode("utf-8").split('.')[0][1:])
+
+     if node_major_version < 20:
+         #st.markdown('Update OS')
+         subprocess.check_call(['sudo', 'apt-get', 'update'])
+
+         st.markdown('Downloading NodeSource prerequisites')
+         subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'ca-certificates', 'curl', 'gnupg'])
+         subprocess.check_call(['sudo', 'mkdir', '-p', '/etc/apt/keyrings'])
+         subprocess.check_call('curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg', shell=True)
+
+         NODE_MAJOR = 20
+         node_source_entry = f"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_{NODE_MAJOR}.x nodistro main"
+         subprocess.check_call(f'echo "{node_source_entry}" | sudo tee /etc/apt/sources.list.d/nodesource.list', shell=True)
+
+         #st.markdown('Install Node.js')
+         subprocess.check_call(['sudo', 'apt-get', 'update'])
+         subprocess.check_call(['sudo', 'apt-get', 'install', 'nodejs', '-y'])
+
+         result = subprocess.check_output(['node', '-v']).decode("utf-8")
+         #st.markdown(f'Node.js version: {result}')
+     else:
+         #st.markdown('Node.js version already installed')
+         result = subprocess.check_output(['node', '-v']).decode("utf-8")
+         #st.markdown(f'Node.js version already updated to {result}')
+
+ ## ............................................... ##
+ # Function to run tweet-harvest
+ @st.cache_data
+ def run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename):
+     # Run tweet-harvest with the provided parameters
+     #st.markdown('Check Tweet')
+     command = f'npx --yes tweet-harvest@latest -s "{search_keyword}" -f "{from_date}" -t "{to_date}" -l {limit} -d {delay} --token "{token}" -o "{filename}"'
+     try:
+         result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
+         #st.markdown("Command executed successfully.")
+         #st.markdown(result.stdout)  # Display the standard output; uncomment to show it
+     except subprocess.CalledProcessError as e:
+         st.markdown("Error: The command returned a non-zero exit status.")
+         st.markdown(f"Error message: {e}")
+         st.markdown(f'Standard output: {e.stdout}')
+         st.markdown(f'Standard error: {e.stderr}')
+
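+ # Note: tweet-harvest is expected to write its CSV output under the
+ # tweets-data/ folder, which is where selection_data() below reads it from.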
+ ## ............................................... ##
+ # Function to get the model and tokenizer
+ @st.cache_resource
+ def get_models_and_tokenizers():
+     model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+     #model.eval()
+
+     return model, tokenizer
+
+ ## ............................................... ##
+ # Function for sentiment analysis
+ @st.cache_resource
+ def analyze_sentiment_distilbert(text, _model, _tokenizer):
+     try:
+         tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
+         with torch.no_grad():
+             raw_predictions = _model(**tokens_info).logits
+
+         predicted_class_id = raw_predictions.argmax().item()
+         predict = _model.config.id2label[predicted_class_id]
+
+         softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
+         if softmaxed > 70:
+             status = 'Not trust'
+         elif softmaxed > 40:
+             status = 'Not sure'
+         else:
+             status = 'Trust'
+         return status, predict
+
+     except Exception as e:
+         logging.error(f"Sentiment analysis error: {str(e)}")
+         return 'N/A', 'N/A'
+
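+ # For this SST-2 checkpoint, id2label maps 0 -> 'NEGATIVE' and 1 -> 'POSITIVE',
+ # so `softmaxed` above is the positive-class probability in percent; the 70/40
+ # cutoffs turn it into the three 'Fake Check' trust levels.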
+ ## ............................................... ##
+ # Function for sentiment analysis using VADER
+ @st.cache_resource
+ def analyze_sentiment_vader(text):
+     analyzer = SentimentIntensityAnalyzer()
+     sentiment = analyzer.polarity_scores(text)
+     compound_score = sentiment['compound']
+     if compound_score >= 0.05:
+         return 'Positive'
+     elif compound_score <= -0.05:
+         return 'Negative'
+     else:
+         return 'Neutral'
+
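+ # The +/-0.05 compound-score cutoffs are the thresholds recommended by the
+ # VADER documentation for splitting text into positive/negative/neutral.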
+ ## ............................................... ##
+ # Function for sentiment analysis using TextBlob
+ @st.cache_resource
+ def analyze_sentiment_textblob(text):
+     analysis = TextBlob(text)
+     polarity = analysis.sentiment.polarity
+     if polarity > 0:
+         return 'Positive'
+     elif polarity < 0:
+         return 'Negative'
+     else:
+         return 'Neutral'
+
+ ## ............................................... ##
+ # Function for translation
+ @st.cache_data
+ def translate_text(text, source='auto', target='en'):
+     try:
+         if source != target:
+             text = GoogleTranslator(source=source, target=target).translate(text)
+         return text
+
+     except Exception as e:
+         logging.error(f"Translation error: {str(e)}")
+         return text
+
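+ # deep_translator's GoogleTranslator also accepts source='auto' to let Google
+ # detect the source language; the caller below passes the langdetect code
+ # explicitly instead.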
+ ## ............................................... ##
+ # Function to load and transform the data
+ @st.cache_data
+ def selection_data(filename):
+     file_path = f"tweets-data/{filename}"
+     df = pd.read_csv(file_path, delimiter=";")
+
+     # Rename columns
+     column_mapping = {
+         'created_at': 'Created Date',
+         'user_id_str': 'User ID',
+         'username': 'Username',
+         'full_text': 'Tweet',
+         'tweet_url': 'Tweet URL',
+         'id_str': 'Tweet ID',
+         'conversation_id_str': 'Conversation ID',
+         'lang': 'App Language',
+         'quote_count': 'Quote Count',
+         'reply_count': 'Reply Count',
+         'retweet_count': 'Retweet Count',
+         'favorite_count': 'Favorite Count',
+     }
+
+     df = df.rename(columns=column_mapping)
+
+     # Add a new column for detected language
+     df['Detect Language'] = df['Tweet'].apply(lambda tweet: detect(tweet))
+
+     # Mapping language codes to language names
+     language_to_country = {
+         'af': 'Afrikaans',
+         'ar': 'Arabic',
+         'bg': 'Bulgarian',
+         'bn': 'Bengali',
+         'ca': 'Catalan',
+         'cs': 'Czech',
+         'cy': 'Welsh',
+         'da': 'Danish',
+         'de': 'German',
+         'el': 'Greek',
+         'en': 'English',
+         'es': 'Spanish',
+         'et': 'Estonian',
+         'fa': 'Persian',
+         'fi': 'Finnish',
+         'fr': 'French',
+         'gu': 'Gujarati',
+         'he': 'Hebrew',
+         'hi': 'Hindi',
+         'hr': 'Croatian',
+         'hu': 'Hungarian',
+         'id': 'Indonesian',
+         'it': 'Italian',
+         'ja': 'Japanese',
+         'kn': 'Kannada',
+         'ko': 'Korean',
+         'lt': 'Lithuanian',
+         'lv': 'Latvian',
+         'mk': 'Macedonian',
+         'ml': 'Malayalam',
+         'mr': 'Marathi',
+         'ne': 'Nepali',
+         'nl': 'Dutch',
+         'no': 'Norwegian',
+         'pa': 'Punjabi',
+         'pl': 'Polish',
+         'pt': 'Portuguese',
+         'ro': 'Romanian',
+         'ru': 'Russian',
+         'sk': 'Slovak',
+         'sl': 'Slovenian',
+         'so': 'Somali',
+         'sq': 'Albanian',
+         'sv': 'Swedish',
+         'sw': 'Swahili',
+         'ta': 'Tamil',
+         'te': 'Telugu',
+         'th': 'Thai',
+         'tl': 'Tagalog',
+         'tr': 'Turkish',
+         'uk': 'Ukrainian',
+         'ur': 'Urdu',
+         'vi': 'Vietnamese',
+         'zh-cn': 'Simplified Chinese',
+         'zh-tw': 'Traditional Chinese'
+     }
+
+     # Add a 'Language' column to df
+     df['Language'] = df['Detect Language'].map(language_to_country)
+
+     # Sort columns
+     desired_columns = ['Created Date', 'User ID', 'Username', 'Tweet', 'Language', 'Detect Language', 'App Language', 'Tweet URL', 'Tweet ID', 'Conversation ID', 'Quote Count', 'Reply Count', 'Retweet Count', 'Favorite Count']
+     df = df[desired_columns]
+
+     # Set data types
+     data_types = {
+         'Created Date': 'datetime64[ns]',
+         'User ID': 'int64',
+         'Username': 'object',
+         'Tweet': 'object',
+         'Language': 'object',
+         'Detect Language': 'object',
+         'App Language': 'object',
+         'Tweet URL': 'object',
+         'Tweet ID': 'int64',
+         'Conversation ID': 'int64',
+         'Quote Count': 'int64',
+         'Reply Count': 'int64',
+         'Retweet Count': 'int64',
+         'Favorite Count': 'int64',
+     }
+
+     df = df.astype(data_types)
+
+     return df
+
+ ## ............................................... ##
+ # Function to preprocess the data
+ @st.cache_data
+ def preprocessing_data(df):
+     # Remove duplicates
+     df = df.drop_duplicates(subset='Translation')
+
+     # Function to clean and preprocess text
+     def clean_text(text):
+         # Remove mentions (e.g., @username)
+         text = re.sub(r'@[\w]+', '', text)
+
+         # Remove URLs
+         text = re.sub(r'http\S+', '', text)
+
+         # Remove HTML tags
+         text = BeautifulSoup(text, 'html.parser').get_text()
+
+         # Convert to lowercase
+         text = text.lower()
+
+         # Remove everything except letters and whitespace
+         text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+         # Tokenize text
+         words = nltk.word_tokenize(text)
+
+         # Remove stopwords
+         stop_words = set(stopwords.words('english'))
+         words = [word for word in words if word not in stop_words]
+
+         # Lemmatize words
+         lemmatizer = WordNetLemmatizer()
+         words = [lemmatizer.lemmatize(word) for word in words]
+
+         return ' '.join(words)
+
+     # Apply the clean_text function to the "Translation" column
+     df['Cleaned Translation'] = df['Translation'].apply(clean_text)
+
+     return df
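+ # Example: clean_text('@user Check https://t.co/x <b>Great</b> day!!!')
+ # returns 'check great day' (mention, URL, and tags stripped, lowercased,
+ # punctuation dropped, stopwords removed, tokens lemmatized and re-joined).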
+
+ ## ............................................... ##
+ # Function to create a Word Cloud
+ @st.cache_data
+ def create_wordcloud(df):
+     # Combine all text
+     text = ' '.join(df['Cleaned Translation'])
+
+     # Create a Word Cloud
+     wordcloud = WordCloud(width=700, height=400, max_words=50).generate(text)
+
+     # Convert the word cloud to an image
+     wordcloud_image = wordcloud.to_image()
+
+     # Display the Word Cloud using st.image
+     st.write("Word Cloud by Tweets")
+     st.image(wordcloud_image, use_column_width=True)
+
+
+ ## ............................................... ##
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ @st.cache_data
+ def convert_df(df):
+     return df.to_csv().encode('utf-8')
+
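+ # Note: DataFrame.to_csv() includes the index column by default; pass
+ # index=False inside convert_df() to drop it from the download.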
+ ## ............................................... ##
+ # Set up logging
+ logging.basicConfig(filename='tweet_harvest.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+ ## ............................................... ##
+ with st.container():
+     # Input search parameters
+     search_keyword = st.text_input("Enter search keyword", "Jakarta")
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         from_date = st.date_input('From Date :', pd.to_datetime('2023-01-01'))
+         to_date = st.date_input('To Date :', pd.to_datetime('2023-12-01'))
+     with col2:
+         limit = st.number_input("Enter limit", min_value=10, value=10, max_value=100)
+         delay = st.number_input("Enter delay in seconds", min_value=1, value=3)
+
+     token = st.text_input("Enter your X Auth Token", type="password")
+
+ ## ............................................... ##
+ with st.container():
+     col1, col2 = st.columns(2)
+
+     with col1:
+         # Checkbox options for different processing steps
+         include_translation = st.checkbox("Include Translation", value=False)
+         include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
+     with col2:
+         include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
+         include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)
+
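+ # Note: all three sentiment options operate on the 'Translation' column,
+ # which is only created when "Include Translation" is checked above.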
+ ## ............................................... ##
+ # Install Node.js if it is missing or outdated
+ install_nodejs()
+
+ # Initialize model and tokenizer
+ model, tokenizer = get_models_and_tokenizers()
+
+ # Create a variable to track whether the data has been processed
+ data_processed = False
+
+ ## ............................................... ##
+ # Create a button to trigger tweet-harvest
+ with st.container():
+     if st.button("Run it"):
+         # Format the dates as "DD-MM-YYYY"
+         from_date = from_date.strftime("%d-%m-%Y")
+         to_date = to_date.strftime("%d-%m-%Y")
+
+         filename = 'tweets_data.csv'
+
+         run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename)
+
+         df = selection_data(filename)
+
+         # Conditionally apply translation function to the 'Translation' column
+         if include_translation:
+             df['Translation'] = df.apply(lambda row: translate_text(row['Tweet'], source=row['Detect Language'], target='en'), axis=1)
+             df = preprocessing_data(df)
+
+         # Conditionally apply sentiment analysis function to the 'Translation' column
+         if include_sentiment_analysis:
+             df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer))).apply(lambda x: x.str.title())
+
+         # Conditionally apply VADER sentiment analysis to the 'Translation' column
+         if include_sentiment_vader:
+             df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)
+
+         # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+         if include_sentiment_textblob:
+             df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
+
+         # Set data_processed to True when the data has been successfully processed
+         data_processed = True
+
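+     # Note: because run_X_scrapping() is cached with @st.cache_data, pressing
+     # "Run it" again with identical parameters reuses the cached result and
+     # skips re-scraping.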
+     ## ............................................... ##
+     # Add a button to download the data as a CSV file
+     if data_processed:
+         st.markdown("### Download Processed Data as CSV")
+         st.write("Click the button below to download the processed data as a CSV file.")
+         csv_data = convert_df(df)
+
+         # Create a downloadable link
+         st.download_button(
+             label="Download data as CSV",
+             data=csv_data,
+             file_name='processed_data.csv',
+             mime='text/csv',
+         )
+
+         with st.expander("See Table"):
+             ## ............................................... ##
+             # Display processed data
+             st.dataframe(df)
+
+         # Display exploratory data analysis
+         with st.expander("See EDA"):
+             ## ............................................... ##
+             # Tweet data visualizations
+             st.subheader("Tweet Data Visualization")
+
+             col1, col2 = st.columns(2)
+             with col1:
+                 ## ............................................... ##
+                 # Create a new column with a count of 1 for each tweet
+                 df_date = pd.DataFrame(df['Created Date'])
+                 df_date['Tweet Count'] = 1
+
+                 # Resample the data per second and calculate the count
+                 data_resampled = df_date.resample('S', on='Created Date')['Tweet Count'].count().reset_index()
+
+                 # Create a time series plot with custom styling
+                 fig = px.line(data_resampled, x='Created Date', y='Tweet Count', title='Tweet Counts Over Time')
+                 fig.update_xaxes(title_text='Time')
+                 fig.update_yaxes(title_text='Tweet Count')
+                 fig.update_layout(xaxis_rangeslider_visible=True)
+
+                 # Render the chart, sized to the container
+                 st.plotly_chart(fig, use_container_width=True)
+
+                 ## ............................................... ##
+                 # Group by Sentiment columns and get the count
+                 sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
+
+                 # Reset index to get Sentiment as a column
+                 sentiment_counts = sentiment_counts.reset_index()
+
+                 # Melt the DataFrame for easier plotting
+                 sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
+
+                 # Create the plot
+                 fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total Tweets per Sentiment')
+
+                 # Render the chart, sized to the container
+                 st.plotly_chart(fig, use_container_width=True)
+
+             with col2:
+                 ## ............................................... ##
+                 # Create a DataFrame to count the number of tweets by language
+                 language_counts = df['Language'].value_counts().reset_index()
+                 language_counts.columns = ['Language', 'Tweet Count']
+
+                 # Create an attractive Plotly bar chart
+                 fig = px.bar(language_counts, x='Language', y='Tweet Count', text='Tweet Count', title='Total Tweets by Language')
+                 fig.update_xaxes(title_text='Language')
+                 fig.update_yaxes(title_text='Total Tweets')
+
+                 # Render the chart, sized to the container
+                 st.plotly_chart(fig, use_container_width=True)
+
+             ## ............................................... ##
+             # Create wordcloud
+             create_wordcloud(df)
+
+             ## ............................................... ##
+             # Show dataset information
+             pr = ProfileReport(df)
+             st_profile_report(pr)