Create π±_X_Scrapping.py
pages/π±_X_Scrapping.py  +534 -0
ADDED
@@ -0,0 +1,534 @@
# Data Analysis and Profiling
import pandas as pd
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report

# Streamlit for Building the Dashboard
import streamlit as st
import streamlit_pandas_profiling

# Language Detection
from langdetect import detect

# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# URL Parsing
from urllib.parse import urlparse

# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Word Cloud Generation
from wordcloud import WordCloud

# Other Libraries
import torch
import requests
import subprocess
import logging
import re
import os

# NLTK Data Download ('stopwords' is needed later by the text-cleaning step)
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

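# Rough dependency list implied by the imports above (usual PyPI package names,
# unpinned): streamlit, pandas, ydata-profiling, streamlit-pandas-profiling,
# langdetect, transformers, torch, deep-translator, nltk, beautifulsoup4,
# vaderSentiment, textblob, plotly, matplotlib, wordcloud, requests.
# Node.js is also required for the tweet-harvest CLI and is installed further below.
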
## ............................................... ##
# Set page configuration (Call this once and make changes as needed)
st.set_page_config(page_title='(Tweet) X Scrapper Dashboard', layout='wide', page_icon=':rocket:')


## ............................................... ##
with st.container():
    # Define Streamlit app title and introduction
    st.title("(Tweet) X Scrapper Dashboard")
    st.write("Created by Bayhaqy")

    # Sidebar content
    st.sidebar.subheader("About the app")
    st.sidebar.info("This app lets you collect data, run analysis, and get predictions with the (Tweet) X Scrapper tool.")

    url = "https://blogs.bayhaqy.my.id/2023/10/auth-token-twitter.html"
    st.sidebar.markdown("Check this [link](%s) for a guide on how to get your own X Auth Token" % url)

    st.sidebar.write("\n\n")
    st.sidebar.markdown("**Please contact me if you have any questions**")
    st.sidebar.write("\n\n")
    st.sidebar.divider()
    st.sidebar.markdown("© 2023 (Tweet) X Scrapper Dashboard")

## ............................................... ##
# Function to install Node.js (needed by the tweet-harvest CLI)
@st.cache_data
def install_nodejs():
    # Treat a missing Node.js binary as version 0 so it gets installed below
    try:
        node_major_version = int(subprocess.check_output(['node', '-v']).decode("utf-8").split('.')[0][1:])
    except (FileNotFoundError, subprocess.CalledProcessError):
        node_major_version = 0

    if node_major_version < 20:
        #st.markdown('Update OS')
        subprocess.check_call(['sudo', 'apt-get', 'update'])

        st.markdown('Download the NodeSource repository prerequisites')
        subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'ca-certificates', 'curl', 'gnupg'])
        subprocess.check_call(['sudo', 'mkdir', '-p', '/etc/apt/keyrings'])
        subprocess.check_call('curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg', shell=True)

        NODE_MAJOR = 20
        node_source_entry = f"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_{NODE_MAJOR}.x nodistro main"
        subprocess.check_call(f'echo "{node_source_entry}" | sudo tee /etc/apt/sources.list.d/nodesource.list', shell=True)

        #st.markdown('Install Node.js')
        subprocess.check_call(['sudo', 'apt-get', 'update'])
        subprocess.check_call(['sudo', 'apt-get', 'install', 'nodejs', '-y'])

        result = subprocess.check_output(['node', '-v']).decode("utf-8")
        #st.markdown(f'Node.js version: {result}')
    else:
        #st.markdown('Node.js version already installed')
        result = subprocess.check_output(['node', '-v']).decode("utf-8")
        #st.markdown(f'Node.js version already updated to {result}')

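# Note: install_nodejs() assumes a Debian/Ubuntu-style host with passwordless sudo
# available (apt-get, /etc/apt/keyrings); on other platforms Node.js >= 20 has to be
# installed manually before running this page.
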
## ............................................... ##
# Function to run tweet-harvest
@st.cache_data
def run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename):
    # Run tweet-harvest with the provided parameters
    #st.markdown('Check Tweet')
    command = f'npx --yes tweet-harvest@latest -s "{search_keyword}" -f "{from_date}" -t "{to_date}" -l {limit} -d {delay} --token "{token}" -o "{filename}"'
    try:
        result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
        #st.markdown("Command executed successfully.")
        #st.markdown(result.stdout)  # Display the standard output; comment out if you don't want to see it
    except subprocess.CalledProcessError as e:
        st.markdown("Error: The command returned a non-zero exit status.")
        st.markdown(f"Error message: {e}")
        st.markdown(f'Standard output: {e.stdout}')
        st.markdown(f'Standard error: {e.stderr}')

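# The tweet-harvest flags used above: -s search keyword, -f/-t date range, -l tweet
# limit, -d delay between requests, --token the X auth token, -o output CSV name.
# The harvester is expected to write its CSV under tweets-data/<filename>, which is
# where selection_data() reads it back.
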
## ............................................... ##
# Function to load the model and tokenizer
@st.cache_resource
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    #model.eval()

    return model, tokenizer

## ............................................... ##
# Function for sentiment analysis
@st.cache_resource
def analyze_sentiment_distilbert(text, _model, _tokenizer):
    try:
        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = _model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = _model.config.id2label[predicted_class_id]

        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if (softmaxed > 70):
            status = 'Not trust'
        elif (softmaxed > 40):
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict

    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'

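# How the two return values are used downstream: 'predict' is the model's
# POSITIVE/NEGATIVE label (this checkpoint is fine-tuned on SST-2), and 'softmaxed'
# is the probability, in percent, of class index 1 (POSITIVE). The 'Trust' /
# 'Not sure' / 'Not trust' buckets at 40% / 70% are the author's own heuristic
# thresholds on that score, later shown as the 'Fake Check' column; they are not
# something the model itself provides.
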
## ............................................... ##
# Function for sentiment analysis using VADER
@st.cache_resource
def analyze_sentiment_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    compound_score = sentiment['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

## ............................................... ##
# Function for sentiment analysis using TextBlob
@st.cache_resource
def analyze_sentiment_textblob(text):
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

## ............................................... ##
# Function for translation
@st.cache_data
def translate_text(text, source='auto', target='en'):
    try:
        if source != target:
            text = GoogleTranslator(source=source, target=target).translate(text)
        return text

    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text

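# The +/-0.05 compound cut-offs above are VADER's conventional thresholds for
# positive/negative/neutral. translate_text() receives the langdetect code as its
# source language; most of those codes are ISO-639-1 values that GoogleTranslator
# accepts directly, but if one is rejected the except branch simply returns the
# untranslated text.
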
## ............................................... ##
# Function to load and transform the data
@st.cache_data
def selection_data(filename):
    file_path = f"tweets-data/{filename}"
    df = pd.read_csv(file_path, delimiter=";")

    # Rename columns
    column_mapping = {
        'created_at': 'Created Date',
        'user_id_str': 'User ID',
        'username': 'Username',
        'full_text': 'Tweet',
        'tweet_url': 'Tweet URL',
        'id_str': 'Tweet ID',
        'conversation_id_str': 'Conversation ID',
        'lang': 'App Language',
        'quote_count': 'Quote Count',
        'reply_count': 'Reply Count',
        'retweet_count': 'Retweet Count',
        'favorite_count': 'Favorite Count',
    }

    df = df.rename(columns=column_mapping)

    # Add a new column for the detected language
    # (langdetect raises on tweets with no detectable text, e.g. URLs or emoji only,
    # so fall back to 'unknown' in that case)
    def safe_detect(tweet):
        try:
            return detect(tweet)
        except Exception:
            return 'unknown'

    df['Detect Language'] = df['Tweet'].apply(safe_detect)

    # Mapping language codes to language names
    language_names = {
        'af': 'Afrikaans',
        'ar': 'Arabic',
        'bg': 'Bulgarian',
        'bn': 'Bengali',
        'ca': 'Catalan',
        'cs': 'Czech',
        'cy': 'Welsh',
        'da': 'Danish',
        'de': 'German',
        'el': 'Greek',
        'en': 'English',
        'es': 'Spanish',
        'et': 'Estonian',
        'fa': 'Persian',
        'fi': 'Finnish',
        'fr': 'French',
        'gu': 'Gujarati',
        'he': 'Hebrew',
        'hi': 'Hindi',
        'hr': 'Croatian',
        'hu': 'Hungarian',
        'id': 'Indonesian',
        'it': 'Italian',
        'ja': 'Japanese',
        'kn': 'Kannada',
        'ko': 'Korean',
        'lt': 'Lithuanian',
        'lv': 'Latvian',
        'mk': 'Macedonian',
        'ml': 'Malayalam',
        'mr': 'Marathi',
        'ne': 'Nepali',
        'nl': 'Dutch',
        'no': 'Norwegian',
        'pa': 'Punjabi',
        'pl': 'Polish',
        'pt': 'Portuguese',
        'ro': 'Romanian',
        'ru': 'Russian',
        'sk': 'Slovak',
        'sl': 'Slovenian',
        'so': 'Somali',
        'sq': 'Albanian',
        'sv': 'Swedish',
        'sw': 'Swahili',
        'ta': 'Tamil',
        'te': 'Telugu',
        'th': 'Thai',
        'tl': 'Tagalog',
        'tr': 'Turkish',
        'uk': 'Ukrainian',
        'ur': 'Urdu',
        'vi': 'Vietnamese',
        'zh-cn': 'Simplified Chinese',
        'zh-tw': 'Traditional Chinese'
    }

    # Add 'Language' column to df
    df['Language'] = df['Detect Language'].map(language_names)

    # Sort columns
    desired_columns = ['Created Date', 'User ID', 'Username', 'Tweet', 'Language', 'Detect Language', 'App Language', 'Tweet URL', 'Tweet ID', 'Conversation ID', 'Quote Count', 'Reply Count', 'Retweet Count', 'Favorite Count']
    df = df[desired_columns]

    # Set data types
    data_types = {
        'Created Date': 'datetime64[ns]',
        'User ID': 'int64',
        'Username': 'object',
        'Tweet': 'object',
        'Language': 'object',
        'Detect Language': 'object',
        'App Language': 'object',
        'Tweet URL': 'object',
        'Tweet ID': 'int64',
        'Conversation ID': 'int64',
        'Quote Count': 'int64',
        'Reply Count': 'int64',
        'Retweet Count': 'int64',
        'Favorite Count': 'int64',
    }

    df = df.astype(data_types)

    return df

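# Caution on the dtype casts above: astype('int64') fails if any of the ID or count
# columns contain missing values, and 'datetime64[ns]' expects a format pandas can
# parse; if the harvester output is incomplete, these casts are the most likely place
# for selection_data() to raise.
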
## ............................................... ##
# Function to preprocess the data
@st.cache_data
def preprocessing_data(df):
    # Remove duplicates
    df = df.drop_duplicates(subset='Translation')

    # Function to clean and preprocess text
    def clean_text(text):
        # Remove mentions (e.g., @username)
        text = re.sub(r'@[\w]+', '', text)

        # Remove URLs
        text = re.sub(r'http\S+', '', text)

        # Remove HTML tags
        text = BeautifulSoup(text, 'html.parser').get_text()

        # Convert to lowercase
        text = text.lower()

        # Keep only letters and whitespace
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize text
        words = nltk.word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        # Lemmatize words
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        return ' '.join(words)

    # Apply the clean_text function to the "Translation" column
    df['Cleaned Translation'] = df['Translation'].apply(clean_text)

    return df

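# preprocessing_data() works on the 'Translation' column and produces
# 'Cleaned Translation', so in the flow below it is only called when the
# "Include Translation" checkbox is ticked; the word cloud further down depends on
# that same 'Cleaned Translation' column.
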
## ............................................... ##
# Function to create a Word Cloud
@st.cache_data
def create_wordcloud(df):
    # Combine all text
    text = ' '.join(df['Cleaned Translation'])

    # Create a Word Cloud
    wordcloud = WordCloud(width=700, height=400, max_words=50).generate(text)

    # Convert the word cloud to an image
    wordcloud_image = wordcloud.to_image()

    # Display the Word Cloud using st.image
    st.write("Word Cloud of Tweets")
    st.image(wordcloud_image, use_column_width=True)

## ............................................... ##
# IMPORTANT: Cache the conversion to prevent computation on every rerun
@st.cache_data
def convert_df(df):
    return df.to_csv().encode('utf-8')

## ............................................... ##
# Set up logging
logging.basicConfig(filename='tweet_harvest.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

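# convert_df() uses DataFrame.to_csv() with its defaults, so the downloaded CSV
# includes the index as an unnamed first column; pass index=False there if that
# column is not wanted.
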
## ............................................... ##
with st.container():
    # Input search parameters
    search_keyword = st.text_input("Enter search keyword", "Jakarta")

    col1, col2 = st.columns(2)

    with col1:
        from_date = st.date_input('From Date :', pd.to_datetime('2023-01-01'))
        to_date = st.date_input('To Date :', pd.to_datetime('2023-12-01'))
    with col2:
        limit = st.number_input("Enter limit", min_value=10, value=10, max_value=100)
        delay = st.number_input("Enter delay in seconds", min_value=1, value=3)

    token = st.text_input("Enter your X Auth Token", type="password")

## ............................................... ##
with st.container():
    col1, col2 = st.columns(2)

    with col1:
        # Checkbox options for different processing steps
        include_translation = st.checkbox("Include Translation", value=False)
        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
    with col2:
        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)

## ............................................... ##
# Initialize to install Node.js
install_nodejs()

# Initialize model and tokenizer
model, tokenizer = get_models_and_tokenizers()

# Create a variable to track whether the data has been processed
data_processed = False

## ............................................... ##
# Create a button to trigger tweet-harvest
with st.container():
    if st.button("Run it"):
        # Format the dates as "DD-MM-YYYY"
        from_date = from_date.strftime("%d-%m-%Y")
        to_date = to_date.strftime("%d-%m-%Y")

        filename = 'tweets_data.csv'

        run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename)

        df = selection_data(filename)

        # Conditionally translate the 'Tweet' column into a new 'Translation' column
        if include_translation:
            df['Translation'] = df.apply(lambda row: translate_text(row['Tweet'], source=row['Detect Language'], target='en'), axis=1)
            df = preprocessing_data(df)

        # Conditionally apply sentiment analysis to the 'Translation' column
        if include_sentiment_analysis:
            df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer))).apply(lambda x: x.str.title())

        # Conditionally apply VADER sentiment analysis to the 'Translation' column
        if include_sentiment_vader:
            df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)

        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
        if include_sentiment_textblob:
            df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)

        # Set data_processed to True when the data has been successfully processed
        data_processed = True

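# Note on checkbox interactions: the three sentiment branches above all read
# df['Translation'], which only exists when "Include Translation" is ticked, and the
# EDA sentiment chart below expects the 'Sentiment ...' columns those branches create.
# So the sentiment and EDA options effectively require the translation option as well;
# running them without it will fail on the missing column.
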
## ............................................... ##
# Add a button to download the data as a CSV file
if data_processed:
    st.markdown("### Download Processed Data as CSV")
    st.write("Click the button below to download the processed data as a CSV file.")
    csv_data = convert_df(df)

    # Create a downloadable link
    st.download_button(
        label="Download data as CSV",
        data=csv_data,
        file_name='processed_data.csv',
        mime='text/csv',
    )

    with st.expander("See Table"):
        ## ............................................... ##
        # Display processed data
        st.dataframe(df)

    # Display exploratory data analysis
    with st.expander("See EDA"):
        ## ............................................... ##
        st.subheader("Tweet Data Visualization")

        col1, col2 = st.columns(2)
        with col1:
            ## ............................................... ##
            # Create a new column with a count of 1 for each tweet
            df_date = pd.DataFrame(df['Created Date'])
            df_date['Tweet Count'] = 1

            # Resample the data per second and calculate the count
            data_resampled = df_date.resample('S', on='Created Date')['Tweet Count'].count().reset_index()

            # Create a time series plot with custom styling
            fig = px.line(data_resampled, x='Created Date', y='Tweet Count', title='Tweet Counts Over Time')
            fig.update_xaxes(title_text='Time')
            fig.update_yaxes(title_text='Tweet Count')
            fig.update_layout(xaxis_rangeslider_visible=True)

            # Render the chart at the container width
            st.plotly_chart(fig, use_container_width=True)

            ## ............................................... ##
            # Group by the sentiment columns that were actually produced and get the counts
            sentiment_columns = [col for col in ['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob'] if col in df.columns]
            if sentiment_columns:
                sentiment_counts = df[sentiment_columns].apply(lambda x: x.value_counts()).T

                # Reset index to get Sentiment as a column
                sentiment_counts = sentiment_counts.reset_index()

                # Melt the DataFrame for easier plotting
                sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')

                # Create the plot
                fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total Tweet per Sentiment')

                # Render the chart at the container width
                st.plotly_chart(fig, use_container_width=True)

        with col2:
            ## ............................................... ##
            # Create a DataFrame to count the number of tweets by language
            language_counts = df['Language'].value_counts().reset_index()
            language_counts.columns = ['Language', 'Tweet Count']

            # Create a Plotly bar chart
            fig = px.bar(language_counts, x='Language', y='Tweet Count', text='Tweet Count', title='Total Tweet by Language')
            fig.update_xaxes(title_text='Language')
            fig.update_yaxes(title_text='Total Tweet')

            # Render the chart at the container width
            st.plotly_chart(fig, use_container_width=True)

        ## ............................................... ##
        # Create the word cloud (requires the 'Cleaned Translation' column)
        if 'Cleaned Translation' in df.columns:
            create_wordcloud(df)

        ## ............................................... ##
        # Show dataset information
        pr = ProfileReport(df)
        st_profile_report(pr)