Spaces:

MarMont
/

MARITESS

Sleeping

App Files Files Community

MARITESS / appv1.py

MarMont

more debugging

49e4936 about 1 year ago

raw

history blame contribute delete

18.1 kB

	import json
	import os
	import re
	import string
	from operator import itemgetter
	from pprint import pprint

	import emoji
	import gensim
	import gradio as gr
	import numpy as np
	import pandas as pd
	import spacy
	import torch
	import tqdm
	import tweepy
	from bertopic import BERTopic
	from gensim.corpora import Dictionary
	from gensim.models.coherencemodel import CoherenceModel
	from gensim.parsing.preprocessing import STOPWORDS as SW
	from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
	from googletrans import Translator
	from sklearn.feature_extraction.text import CountVectorizer
	from spacy.tokenizer import Tokenizer
	from transformers import T5ForConditionalGeneration,T5Tokenizer
	from umap import UMAP
	from wordcloud import STOPWORDS

	# Shared dataframe used by the pipeline functions below; `main` rebinds it
	# on every request. (A module-level `global` statement does not create it.)
	df = None

	# SECURITY: the bearer token used to be hard-coded here. It is now read from
	# the environment; the previously committed token should be revoked.
	bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')
	client = tweepy.Client(bearer_token=bearer_token)

	# spaCy pipeline used for stop words, tokenisation and lemmatisation.
	nlp = spacy.load('en_core_web_lg')

	def scrape(keywords):
	    """Search recent tweets for `keywords` and store them in the global `df`.

	    Args:
	        keywords: A query fragment (str), or an iterable of keyword strings
	            which are OR-ed together into one fragment.

	    Returns:
	        The second element of the tweepy response, which the rest of the
	        code treats as the place data.
	    """
	    # Without this the dataframe built below stayed local and was discarded.
	    global df

	    if not isinstance(keywords, str):
	        terms = list(keywords)
	        joined = ' OR '.join(terms)
	        keywords = '(' + joined + ')' if len(terms) > 1 else joined

	    query = keywords + ' (lang:en OR lang:tl) -is:retweet'

	    response = client.search_recent_tweets(
	        query=query,
	        max_results=100,
	        tweet_fields=['geo', 'id', 'lang', 'created_at'],
	        expansions=['geo.place_id'],
	        place_fields=['contained_within', 'country', 'country_code', 'full_name',
	                      'geo', 'id', 'name', 'place_type'],
	    )

	    # The data element may be empty/None when nothing matched the query.
	    tweets = [str(tweet) for tweet in (response[0] or [])]
	    df = pd.DataFrame(tweets, columns=['tweet'])

	    return response[1]

	def get_example(dataset):
	    """Load the CSV file named `<dataset>.csv` and return it as a DataFrame."""
	    csv_path = dataset + '.csv'
	    return pd.read_csv(csv_path)

	def give_emoji_free_text(text):
	    """
	    Drop every whitespace-separated word that contains an emoji.

	    Accepts:
	        Text (tweets)
	    Returns:
	        Text with the emoji-bearing words removed, re-joined by single spaces
	    """
	    # Characters of this text that are known emoji.
	    emoji_chars = {char for char in text if char in emoji.EMOJI_DATA}
	    kept_words = []
	    for word in text.split():
	        # Keep the word only if none of its characters is one of those emoji.
	        if emoji_chars.isdisjoint(word):
	            kept_words.append(word)
	    return ' '.join(kept_words)

	def url_free_text(text):
	    '''
	    Return `text` with every run starting at "http" (up to the next
	    whitespace) removed.
	    '''
	    return re.sub(r'http\S+', '', text)

	def get_lemmas(text):
	    '''Return the lemmas of `text`, skipping stop words, punctuation and pronouns.'''
	    return [
	        token.lemma_
	        for token in nlp(text)
	        if not token.is_stop and not token.is_punct and token.pos_ != 'PRON'
	    ]

	# Tokenizer function
	def tokenize(text):
	    """
	    Parses a string into a list of lowercase word tokens.

	    The cleaning steps are chained, so each one works on the output of the
	    previous step. Previously every step restarted from the raw `text`, so
	    only the last substitution had any effect.

	    Args:
	        text (str): The string that the function will tokenize.
	    Returns:
	        list: tokens parsed out
	    """
	    # Remove URLs first, while their punctuation is still intact.
	    cleaned = re.sub(r"http\S+", "", text)
	    # Keep only ASCII letters, digits and spaces (this also strips punctuation).
	    cleaned = re.sub(r"[^a-zA-Z 0-9]", "", cleaned)
	    # Remove whole words that contain at least one digit.
	    cleaned = re.sub(r"\b\w*\d\w*\b", "", cleaned)

	    # Make text lowercase and split it on whitespace.
	    return cleaned.lower().split()


	def cleaning(df):
	    """Add cleaned, tokenised and lemmatised tweet columns to `df` in place.

	    Renames the `tweet` column to `original_tweets`, then adds, in order:
	    `emoji_free_tweets`, `url_free_tweets`, `tokens`, `tokens_back_to_text`,
	    `lemmas`, `lemmas_back_to_text` and `lemma_tokens`.

	    Args:
	        df: DataFrame with a `tweet` column; it is modified in place.

	    Returns:
	        None.
	    """
	    df.rename(columns={'tweet': 'original_tweets'}, inplace=True)

	    # Strip emoji-bearing words first, then URLs.
	    df['emoji_free_tweets'] = df['original_tweets'].apply(give_emoji_free_text)
	    df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

	    # Tagalog stop words come from a JSON file next to the app; the context
	    # manager makes sure the handle is closed again.
	    with open('stopwords-tl.json', encoding='utf-8') as stopword_file:
	        tl_stopwords = json.load(stopword_file)

	    # wordcloud stop words + Tagalog stop words + a few frequent fillers.
	    stopwords = set(STOPWORDS)
	    stopwords.update(tl_stopwords)
	    stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])

	    custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

	    # union() returns a new set, so spaCy's default stop words stay untouched.
	    # NOTE(review): the gensim stop words (SW) were only ever collected into an
	    # unused variable and never applied; that behaviour is kept here.
	    stop_words = nlp.Defaults.stop_words.union(custom_stopwords, stopwords)

	    tokenizer = Tokenizer(nlp.vocab)

	    tokens = []
	    for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
	        lowered = (token.text.lower() for token in doc)
	        tokens.append([word for word in lowered if word not in stop_words])

	    # Makes tokens column
	    df['tokens'] = tokens

	    # Make tokens a string again
	    df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

	    df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

	    # Make lemmas a string again
	    df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

	    # Apply tokenizer
	    df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

	def split_corpus(corpus, n):
	    """Lazily yield consecutive slices of `corpus`, each at most `n` items long."""
	    yield from (corpus[lo:lo + n] for lo in range(0, len(corpus), n))

	def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
	    """Train one LDA model per topic count and score each with a coherence measure.

	    Args:
	        dictionary: gensim Dictionary used both as the model's `id2word` and
	            by the coherence model.
	        corpus: Bag-of-words corpus the models are trained on.
	        texts: Tokenised documents passed to the coherence model.
	        limit: Exclusive upper bound of the topic counts to try.
	        coherence: Name of the coherence measure (e.g. 'c_v').
	        start: First topic count to try.
	        step: Increment between topic counts.

	    Returns:
	        (model_list, coherence_values), aligned with range(start, limit, step).
	    """
	    coherence_values = []
	    model_list = []
	    for num_topics in range(start, limit, step):
	        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
	                                                num_topics=num_topics,
	                                                random_state=100,
	                                                chunksize=200,
	                                                passes=10,
	                                                per_word_topics=True,
	                                                # Use the dictionary that was passed in,
	                                                # not the module-level global.
	                                                id2word=dictionary)
	        model_list.append(model)
	        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
	        coherence_values.append(coherencemodel.get_coherence())

	    return model_list, coherence_values

	def base_lda():
	    """Pick the LDA topic count by cross-validated coherence.

	    Builds the dictionary and bag-of-words corpus from `df['lemma_tokens']`,
	    splits the corpus into up to five folds, and for every fold trains LDA
	    models for 2..9 topics on the remaining folds. The topic count with the
	    best average coherence is stored in the global `num_topics`.

	    Sets the globals `id2word`, `corpus`, `corpus_og`, `coherence` and
	    `num_topics`.

	    Raises:
	        ValueError: if there are fewer than two documents.
	    """
	    global id2word, corpus, corpus_og, coherence, num_topics

	    # Create a id2word dictionary and filter extremes
	    id2word = Dictionary(df['lemma_tokens'])
	    id2word.filter_extremes(no_below=2, no_above=.99)

	    # Creating a corpus object; corpus_og is an independent list for later steps
	    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
	    corpus_og = list(corpus)

	    coherence = 'c_v'
	    start, limit, step = 2, 10, 1

	    n_folds = min(5, len(corpus))
	    if n_folds < 2:
	        raise ValueError('base_lda needs at least two documents to cross-validate')

	    # Ceiling division so no more than n_folds folds are produced. The folds
	    # are materialised and never mutated; the old loop removed single
	    # documents from the shared corpus on every pass.
	    fold_size = -(-len(corpus) // n_folds)
	    folds = list(split_corpus(corpus, fold_size))

	    coherence_totals = [0.0] * len(range(start, limit, step))
	    for held_out in range(len(folds)):
	        training_corpus = [bow
	                           for fold_no, fold in enumerate(folds) if fold_no != held_out
	                           for bow in fold]
	        _, coherence_values = compute_coherence_values_base_lda(dictionary=id2word, corpus=training_corpus,
	                                                                 texts=df['lemma_tokens'],
	                                                                 start=start,
	                                                                 limit=limit,
	                                                                 step=step,
	                                                                 coherence=coherence)
	        for j, value in enumerate(coherence_values):
	            coherence_totals[j] += value

	    coherence_averages = [total / len(folds) for total in coherence_totals]

	    # c_v is better when larger; other measures are judged by closeness to zero.
	    if coherence == 'c_v':
	        k_max = max(coherence_averages)
	    else:
	        k_max = min(coherence_averages, key=abs)

	    # Map the winning position back to its topic count.
	    num_topics = start + step * coherence_averages.index(k_max)

	def compute_coherence_values2(corpus, dictionary, k, a, b):
	    """Train one LDA model with the given priors and return its c_v coherence.

	    Args:
	        corpus: Bag-of-words corpus to train on.
	        dictionary: gensim Dictionary used as `id2word` and for coherence.
	        k: Number of topics.
	        a: Value passed as the model's `alpha`.
	        b: Value passed as the model's `eta`.

	    Returns:
	        The c_v coherence computed against the global `df['lemma_tokens']`.
	    """
	    # `dictionary` and `k` are now honoured; the globals were used before.
	    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
	                                                id2word=dictionary,
	                                                num_topics=k,
	                                                random_state=100,
	                                                chunksize=200,
	                                                passes=10,
	                                                alpha=a,
	                                                eta=b,
	                                                per_word_topics=True)
	    coherence_model_lda = CoherenceModel(model=lda_model,
	                                         texts=df['lemma_tokens'],
	                                         dictionary=dictionary,
	                                         coherence='c_v')

	    return coherence_model_lda.get_coherence()

	def hyperparameter_optimization():
	    """Grid-search LDA alpha/eta, then fit the final model on the full corpus.

	    Every alpha/beta pair is scored on a 75% clip of `corpus_og` and on the
	    whole of it. All results are written to 'lda_tuning_results_new.csv'.
	    The pair with the highest coherence on the 75% set is used to train the
	    global `lda_model_final` on `corpus_og`.

	    Reads the globals `corpus_og`, `id2word`, `num_topics` and `df`.

	    Returns:
	        The c_v coherence of the final model.
	    """
	    global lda_model_final

	    alpha = [0.05, 0.1, 0.5, 1, 5, 10]
	    beta = [0.05, 0.1, 0.5, 1, 5, 10]

	    num_of_docs = len(corpus_og)
	    corpus_sets = [gensim.utils.ClippedCorpus(corpus_og, int(num_of_docs*0.75)),
	                   corpus_og]
	    corpus_title = ['75% Corpus', '100% Corpus']
	    model_results = {'Validation_Set': [],
	                     'Alpha': [],
	                     'Beta': [],
	                     'Coherence': []
	                     }

	    # The total matches the real number of runs (it was a fixed 540), and the
	    # context manager closes the bar even if a run fails.
	    total_runs = len(corpus_sets) * len(alpha) * len(beta)
	    with tqdm.tqdm(total=total_runs) as pbar:
	        for title, corpus_set in zip(corpus_title, corpus_sets):
	            for a in alpha:
	                for b in beta:
	                    cv = compute_coherence_values2(corpus=corpus_set,
	                                                   dictionary=id2word,
	                                                   k=num_topics,
	                                                   a=a,
	                                                   b=b)
	                    model_results['Validation_Set'].append(title)
	                    model_results['Alpha'].append(a)
	                    model_results['Beta'].append(b)
	                    model_results['Coherence'].append(cv)
	                    pbar.update(1)

	    results_df = pd.DataFrame(model_results)
	    results_df.to_csv('lda_tuning_results_new.csv', index=False)

	    # Select the best pair from the in-memory results; no CSV read-back needed.
	    params_df = results_df[results_df.Validation_Set == '75% Corpus']
	    params_df = params_df.reset_index(drop=True)
	    # Turn +inf into -inf so an infinite score can never win the idxmax below.
	    params_df = params_df.replace(np.inf, -np.inf)
	    max_params = params_df.loc[params_df['Coherence'].idxmax()]
	    max_alpha = float(max_params['Alpha'])
	    max_beta = float(max_params['Beta'])

	    lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus_og,
	                                                      id2word=id2word,
	                                                      num_topics=num_topics,
	                                                      random_state=100,
	                                                      chunksize=200,
	                                                      passes=10,
	                                                      alpha=max_alpha,
	                                                      eta=max_beta,
	                                                      per_word_topics=True)

	    coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
	                                         coherence='c_v')
	    coherence_lda = coherence_model_lda.get_coherence()

	    return coherence_lda

	def assignMaxTopic(l):
	    """Return the first item of the entry in `l` whose second item is largest.

	    With several equally large entries the earliest one wins.
	    """
	    strongest = max(l, key=lambda entry: entry[1])
	    return strongest[0]

	def assignTopic(l):
	    """Return the first item of every entry in `l`, in order.

	    The list used to be built and then dropped, so the function always
	    returned None.
	    """
	    return [entry[0] for entry in l]

	def topic_assignment(df):
	    """Attach topic distributions to `df` in place and group tweets by topic.

	    Adds a `topic` column (sorted topic entries from `lda_model_final` for
	    each document in `corpus_og`) and a `max_topic` column (the dominant
	    topic id). Rows without any topic are dropped. All changes are made in
	    place; previously the filter rebound a local copy, so the caller's frame
	    never got `max_topic`.

	    Also sets the global `topic_clusters`: one list of original tweets per
	    topic id in range(num_topics).

	    Args:
	        df: DataFrame with an `original_tweets` column, row-aligned with
	            `corpus_og`.
	    """
	    global topic_clusters

	    # Element 0 of the model output is the document's topic list.
	    df['topic'] = [sorted(lda_model_final[corpus_og[row]][0]) for row in range(len(df))]

	    # Drop rows with no topic at all, directly on the caller's frame.
	    empty_rows = df.index[df['topic'].map(len) == 0]
	    df.drop(index=empty_rows, inplace=True)

	    df['max_topic'] = df['topic'].map(assignMaxTopic)

	    topic_clusters = [
	        df.loc[df['max_topic'] == topic_id, 'original_tweets'].tolist()
	        for topic_id in range(num_topics)
	    ]

	def get_topic_value(row, i):
	    """Return the value stored for topic id `i` in `row`.

	    Args:
	        row: Sequence of (topic_id, value) entries. It may skip topic ids,
	            so entries are matched by id, not by position.
	        i: Topic id to look up.

	    Returns:
	        The value of the only entry when `row` has exactly one, otherwise
	        the value of the entry whose id equals `i`, or None if there is none.
	    """
	    if len(row) == 1:
	        return row[0][1]
	    for topic_id, value in row:
	        if topic_id == i:
	            return value
	    return None

	def reprsentative_tweets():
	    """Collect up to five top-scoring distinct tweets for every topic.

	    For each topic id in `topic_clusters`, takes the rows of the global `df`
	    whose `max_topic` is that id, ranks them by their value for that topic
	    (highest first), removes duplicate tweets and keeps the first five.

	    Returns:
	        The global `top_tweets`: one list of tweet strings per topic.
	    """
	    global top_tweets
	    top_tweets = []
	    for i in range(len(topic_clusters)):
	        # Work on a copy so the shared frame keeps its full topic lists.
	        tweets = df.loc[df['max_topic'] == i].copy()
	        tweets['topic'] = tweets['topic'].apply(lambda row, i=i: get_topic_value(row, i))
	        ranked = tweets.sort_values('topic', ascending=False)
	        # Keep the de-duplicated frame. Going through set() before lost the
	        # ranking, so the "top five" were arbitrary.
	        ranked = ranked.drop_duplicates(subset=['original_tweets'])
	        top_tweets.append(ranked['original_tweets'].head(5).tolist())
	    return top_tweets

	def topic_summarization(topic_groups):
	    """Generate one headline per topic from its tweets.

	    The tweets of each group are joined, translated to English, and passed
	    to the "Michau/t5-base-en-generate-headline" model.

	    Args:
	        topic_groups: Sequence of lists of tweet strings, one list per topic.

	    Returns:
	        A list with one "Topic <n> <headline>" string per group.
	    """
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	    model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
	    tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
	    model = model.to(device)
	    translator = Translator()

	    headlines = []
	    for i, group in enumerate(topic_groups):
	        tweets = " ".join(group)
	        text = translator.translate(tweets, dest='en').text

	        encoding = tokenizer.encode_plus(text, return_tensors = "pt")
	        input_ids = encoding["input_ids"].to(device)
	        attention_masks = encoding["attention_mask"].to(device)

	        beam_outputs = model.generate(
	            input_ids = input_ids,
	            attention_mask = attention_masks,
	            max_length = 64,
	            num_beams = 3,
	            early_stopping = True,
	        )

	        result = tokenizer.decode(beam_outputs[0])
	        # append, not +=: adding a str to a list extends it one character at a time.
	        headlines.append("Topic " + str(i) + " " + result)

	    return headlines

	def compute_coherence_value_bertopic(topic_model):
	    """Return the coherence score of a fitted BERTopic model.

	    Uses the globals `df`, `corpus`, `id2word` and `coherence`.

	    Args:
	        topic_model: A fitted BERTopic model.

	    Returns:
	        The coherence score from gensim's CoherenceModel.
	    """
	    # Topic ids come from the model itself. The old code read a global
	    # `topics` that was never defined, so it always raised NameError.
	    # Id -1 is skipped, as the original `len(...) - 1` range intended.
	    topic_ids = sorted(set(topic_model.topics_) - {-1})
	    topic_words = [[word for word, _ in topic_model.get_topic(topic_id)] for topic_id in topic_ids]
	    coherence_model = CoherenceModel(topics=topic_words,
	                                     texts=df['lemma_tokens'],
	                                     corpus=corpus,
	                                     dictionary=id2word,
	                                     coherence=coherence)
	    coherence_score = coherence_model.get_coherence()

	    return coherence_score

	def base_bertopic():
	    """Fit a baseline BERTopic model on the lemmatised tweets and print its coherence.

	    Adds a `lemma_tokens_string` column to the global `df` and sets the
	    globals `id2word`, `corpus` and `umap_model`.
	    """
	    global id2word, corpus, umap_model

	    df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
	    id2word = Dictionary(df['lemma_tokens'])
	    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

	    umap_model = UMAP(n_neighbors=15,
	                      n_components=5,
	                      min_dist=0.0,
	                      metric='cosine',
	                      random_state=100)

	    base_topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)
	    base_topic_model.fit_transform(df['lemma_tokens_string'])

	    # Scoring is best-effort. Catch Exception (not a bare except, which also
	    # swallows KeyboardInterrupt) and print the cause.
	    try:
	        print(compute_coherence_value_bertopic(base_topic_model))
	    except Exception as exc:
	        print('Unable to generate meaningful topics (Base BERTopic model)')
	        print(exc)

	def optimized_bertopic():
	    """Fit the tuned BERTopic model and collect representative tweets per topic.

	    Uses the globals `umap_model` and `df`, including the
	    `lemma_tokens_string` column. Sets the global `top_tweets` to one list
	    of original tweets per topic, skipping topic -1.
	    """
	    global top_tweets

	    vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
	    optimized_topic_model = BERTopic(umap_model=umap_model,
	                                     language="multilingual",
	                                     n_gram_range=(1, 3),
	                                     vectorizer_model=vectorizer_model,
	                                     calculate_probabilities=True)
	    optimized_topic_model.fit_transform(df['lemma_tokens_string'])

	    # Scoring is best-effort. Catch Exception rather than a bare except.
	    try:
	        print(compute_coherence_value_bertopic(optimized_topic_model))
	    except Exception as exc:
	        print('Unable to generate meaningful topics, base BERTopic model if possible')
	        print(exc)

	    rep_docs = optimized_topic_model.representative_docs_

	    # Map each fitted document to the first original tweet that produced it.
	    # The old per-doc scan fell back to the first row when nothing matched.
	    tweet_by_doc = {}
	    for doc, tweet in zip(df['lemma_tokens_string'], df['original_tweets']):
	        tweet_by_doc.setdefault(doc, tweet)

	    top_tweets = []
	    for topic, topic_docs in rep_docs.items():
	        if topic == -1:
	            continue
	        tweets = [tweet_by_doc[doc] for doc in topic_docs if doc in tweet_by_doc]
	        print(tweets)
	        top_tweets.append(tweets)

	# Dataset names that `main` loads through `get_example` instead of the
	# Twitter API. (A module-level `global` statement does not define anything.)
	examples = ["katip,katipunan",
	            "bgc,bonifacio global city",
	            "pobla,poblacion",
	            "cubao",
	            "taft"
	            ]

	def main(dataset, model):
	    """Run the full pipeline for one request and return its two text outputs.

	    Args:
	        dataset: Comma-separated keyword string. Values listed in `examples`
	            are loaded with `get_example`; anything else is scraped.
	        model: 'LDA' selects the LDA pipeline, any other value BERTopic.

	    Returns:
	        (place_data, headlines): the place data as text, and the generated
	        headlines joined by newlines.
	    """
	    # top_tweets must be global. The BERTopic branch only sets the global, and
	    # the LDA assignment below used to make the name local (UnboundLocalError).
	    global df, top_tweets

	    keyword_list = dataset.split(',')
	    if len(keyword_list) > 1:
	        keywords = '(' + ' OR '.join(keyword_list) + ')'
	    else:
	        keywords = keyword_list[0]

	    if dataset in examples:
	        df = get_example(keywords)
	        place_data = 'test'
	    else:
	        # scrape() concatenates its argument onto a query string, so it gets
	        # the assembled fragment, not the raw list.
	        place_data = str(scrape(keywords))

	    cleaning(df)

	    if model == 'LDA':
	        base_lda()
	        hyperparameter_optimization()
	        topic_assignment(df)
	        top_tweets = reprsentative_tweets()
	    else:
	        base_bertopic()
	        optimized_bertopic()

	    headlines = topic_summarization(top_tweets)
	    headlines = '\n'.join(str(h) for h in headlines)

	    return place_data, headlines


	# Gradio UI: two dropdowns (dataset and model) feed `main`, whose two return
	# values are shown as text outputs.
	iface = gr.Interface(
	    fn=main,
	    inputs=[
	        gr.Dropdown(
	            [
	                "katip,katipunan",
	                "bgc,bonifacio global city",
	                "cubao",
	                "taft",
	                "pobla,poblacion",
	            ],
	            label="Dataset",
	        ),
	        gr.Dropdown(["LDA", "BERTopic"], label="Model"),
	    ],
	    # examples=examples,
	    outputs=["text", "text"],
	)
	iface.launch()