# MARITESS / app_old.py
# Required Libraries
#Base and Cleaning
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter
import tqdm
from operator import itemgetter
#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls
#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
import gradio as gr
def give_emoji_free_text(text):
    """
    Removes emojis from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji-free tweets)
    """
    emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
    # Drop every whitespace-delimited word that contains at least one emoji character
    clean_text = ' '.join([word for word in text.split() if not any(i in word for i in emoji_list)])
    return clean_text
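# Illustrative example (assumed input, not from the app's data):
#   give_emoji_free_text("good morning ☀️ everyone") -> "good morning everyone"
#   (any word containing an emoji is dropped entirely)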
def url_free_text(text):
    '''
    Removes URLs from text
    '''
    text = re.sub(r'http\S+', '', text)
    return text
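# Illustrative example (assumed input):
#   url_free_text("so much traffic today https://t.co/abc123") -> "so much traffic today "
#   (the URL substring is removed; surrounding whitespace is left as-is)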
# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Remove URLs
    pattern = r"http\S+"
    tokens = re.sub(pattern, "", text)  # https://www.youtube.com/watch?v=O2onA4r5UaY
    # Chain the remaining substitutions on `tokens` so each cleaning step builds on the previous one
    tokens = re.sub('[^a-zA-Z 0-9]', '', tokens)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', tokens)  # Remove punctuation
    tokens = re.sub(r'\w*\d\w*', '', tokens)  # Remove words containing numbers
    # tokens = re.sub('@*!*$*', '', tokens)  # Remove @ ! $
    tokens = tokens.strip(',?!\'.')  # Strip leftover leading/trailing punctuation
    tokens = tokens.lower().split()  # Make text lowercase and split it
    return tokens
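# Illustrative example (assumed input):
#   tokenize("Traffic in BGC today!!! https://t.co/xyz") -> ['traffic', 'in', 'bgc', 'today']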
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True,
                                                id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
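# Note: trains one LdaModel per candidate topic count in range(start, limit, step)
# and scores each with c_v coherence against `texts`; used below to choose the
# number of topics for the tuned model.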
def compute_coherence_values2(corpus, dictionary, k, a, b):
    # Build a model with the passed-in topic count (k) and priors (alpha=a, eta=b)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                alpha=a,
                                                eta=b,
                                                per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['lemma_tokens'], dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()
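# Note: scores a single LDA configuration with c_v coherence. It still reads the
# module-level df['lemma_tokens'] built inside dataframeProcessing, so it is only
# meaningful once that DataFrame exists.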
def assignTopic(l):
    '''Returns the topic id with the highest probability from a list of (topic_id, probability) pairs'''
    maxTopic = max(l, key=itemgetter(1))[0]
    return maxTopic
def get_topic_value(row, i):
    '''Returns the probability at position i of a row of (topic_id, probability) pairs, or the only probability if the row has one pair'''
    if len(row) == 1:
        return row[0][1]
    else:
        return row[i][1]
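# Illustrative examples (assumed (topic_id, probability) pairs):
#   assignTopic([(0, 0.12), (3, 0.71), (4, 0.17)]) -> 3          # topic with the highest probability
#   get_topic_value([(0, 0.12), (3, 0.71), (4, 0.17)], 1) -> 0.71  # probability at position 1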
def dataframeProcessing(dataset):
    # Load the Tagalog stopword list from the bundled JSON file
    with open('stopwords-tl.json') as f:
        tlStopwords = json.loads(f.read())
    stopwords = set(STOPWORDS)
    stopwords.update(tlStopwords)
    stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])
    global df
    df = pd.read_csv(dataset + '.csv')
    df.rename(columns={'tweet': 'original_tweets'}, inplace=True)
    # Keep only tweets detected as English
    df = df[df['language'].isin(['en'])]
    df.reset_index(inplace=True)
    # Remove emojis from the raw tweets
    df['emoji_free_tweets'] = df['original_tweets'].apply(give_emoji_free_text)
    # Create a new column with URL-free tweets
    df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)
    # Load the spaCy English model
    nlp = spacy.load('en_core_web_lg')
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)
# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
tokens = []
STOP_WORDS.update(stopwords)
for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
doc_tokens = []
for token in doc:
if token.text.lower() not in STOP_WORDS:
doc_tokens.append(token.text.lower())
tokens.append(doc_tokens)
# Makes tokens column
df['tokens'] = tokens
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
    def get_lemmas(text):
        '''Lemmatizes the processed tweets'''
        lemmas = []
        doc = nlp(text)
        # Keep the lemma of every token that is not a stopword, punctuation, or a pronoun
        for token in doc:
            if (not token.is_stop) and (not token.is_punct) and (token.pos_ != 'PRON'):
                lemmas.append(token.lemma_)
        return lemmas
df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
# Create a id2word dictionary
global id2word
id2word = Dictionary(df['lemma_tokens'])
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))
# Creating a corpus object
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
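    # Fit a baseline 5-topic LDA model on the bag-of-words corpus to get an
    # initial look at the topics and a reference coherence score.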
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=5,
random_state=100,
chunksize=200,
passes=10,
per_word_topics=True)
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['lemma_tokens'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
texts=df['lemma_tokens'],
start=2,
limit=10,
step=1)
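    # Pick the topic count with the highest c_v coherence; the sweep starts at
    # k=2, so the chosen index is offset by 2 to recover the topic count.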
k_max = max(coherence_values)
global num_topics
num_topics = coherence_values.index(k_max) + 2
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=num_topics,
random_state=100,
chunksize=200,
passes=10,
per_word_topics=True)
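    # Hyperparameter tuning: grid-search the document-topic prior (alpha) and the
    # topic-word prior (eta/beta) on a 75% slice and on the full corpus, recording
    # the c_v coherence of each configuration.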
grid = {}
grid['Validation_Set'] = {}
alpha = [0.05, 0.1, 0.5, 1, 5, 10]
beta = [0.05, 0.1, 0.5, 1, 5, 10]
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
'Alpha': [],
'Beta': [],
'Coherence': []
}
    pbar = tqdm.tqdm(total=len(corpus_sets) * len(alpha) * len(beta))
    for i in range(len(corpus_sets)):
        for a in alpha:
            for b in beta:
                cv = compute_coherence_values2(corpus=corpus_sets[i], dictionary=id2word, k=num_topics, a=a, b=b)
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
    pbar.close()
params_df = pd.read_csv('lda_tuning_results_new.csv')
params_df = params_df[params_df.Validation_Set == '100% Corpus']
params_df.reset_index(inplace=True)
max_params = params_df.loc[params_df['Coherence'].idxmax()]
max_coherence = max_params['Coherence']
max_alpha = max_params['Alpha']
max_beta = max_params['Beta']
    # Train the final model with the best alpha/eta from the tuning run
    lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                      id2word=id2word,
                                                      num_topics=num_topics,
                                                      random_state=100,
                                                      chunksize=200,
                                                      passes=10,
                                                      alpha=max_alpha,
                                                      eta=max_beta,
                                                      per_word_topics=True)
coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
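    # Format the tuned model's topics for display and assign each tweet to its
    # most probable topic so representative tweets can be pulled per topic.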
lda_topics = lda_model_final.show_topics(num_words=10)
topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
lda_topics_string = ''
for topic in lda_topics:
print(topic)
lda_topics_string += 'Topic ' + str(topic[0]) + '\n' + str(topic[1]) + '\n\n'
topics.append(preprocess_string(topic[1], filters))
df['topic'] = [sorted(lda_model_final[corpus][text][0]) for text in range(len(df['original_tweets']))]
def sort_topics(l):
return(sorted(l, key=lambda x: x[1], reverse=True))
df['topic'] = df['topic'].apply(sort_topics)
df['topic_string'] = df['topic'].astype(str)
df = df[df['topic'].map(lambda d: len(d)) > 0]
df['max_topic'] = df['topic'].map(lambda row: assignTopic(row))
topic_clusters = []
for i in range(num_topics):
topic_clusters.append(df[df['max_topic'].isin(([i]))])
topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
    # Print the five most representative tweets for each topic
    for i in range(len(topic_clusters)):
        tweets = df.loc[df['max_topic'] == i].copy()
        tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
        # tweets['topic'] = [row[i][1] for row in tweets['topic']]
        tweets_sorted = tweets.sort_values('topic', ascending=False)
        tweets_sorted = tweets_sorted.drop_duplicates(subset=['original_tweets'])
        rep_tweets = tweets_sorted['original_tweets'].tolist()
        print('Topic ', i)
        print(rep_tweets[:5])
output_df = df[['topic_string', 'original_tweets']].copy()
return lda_topics_string, output_df
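# Note: dataframeProcessing expects '<dataset>.csv' to sit next to this script and
# to contain at least 'tweet' and 'language' columns; the Dropdown choices below
# are the dataset names without the '.csv' extension.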
def greet(name):
return "Hello " + name + "!!"
iface = gr.Interface(fn=dataframeProcessing,
inputs=gr.Dropdown(["katip-december",
"katipunan-december",
"bgc-december",
"bonifacio global city-december"],
label="Dataset"),
outputs=["text",
gr.Dataframe(headers=['topic_string', 'original_tweets'])])
iface.launch()