Spaces:

marcelomoreno26
/

Whatsapp-Chat-Summarizer-and-Analysis

Runtime error

App Files Files Community

Whatsapp-Chat-Summarizer-and-Analysis / preprocessor.py

marcelomoreno26

Update preprocessor.py

cd28c0f verified 7 months ago

raw

history blame

3.97 kB

	import pandas as pd
	import zipfile
	import re
	from io import BytesIO


	def detect_file_type(file_path):
	    """Classify a file name as a WhatsApp export type by its extension.

	    The comparison is case-insensitive and requires a real ``.txt`` / ``.zip``
	    suffix, so ``"CHAT.ZIP"`` is recognised and ``"myzip"`` is not.

	    Args:
	        file_path (str): File name or path to inspect.

	    Returns:
	        str: ``"txt"`` or ``"zip"`` for supported files, otherwise ``"unknown"``.
	    """
	    lowered = file_path.lower()
	    for extension in ("txt", "zip"):
	        if lowered.endswith("." + extension):
	            return extension
	    return "unknown"

	def _read_chat_text(file_path, file_type):
	    """Return the decoded chat text from a zip export or a plain text source."""
	    if file_type == "zip":
	        with zipfile.ZipFile(file_path, 'r') as archive:
	            names = archive.namelist()
	            # The archive may hold other entries (e.g. media) besides the chat
	            # log, so prefer the first real .txt member and only fall back to
	            # the first entry (IndexError if the archive is empty).
	            chat_name = next(
	                (
	                    name for name in names
	                    if name.lower().endswith('.txt')
	                    and not name.startswith('__MACOSX/')
	                ),
	                names[0],
	            )
	            raw = archive.read(chat_name)
	    elif hasattr(file_path, 'getvalue'):
	        # In-memory upload object exposing its bytes through getvalue().
	        raw = file_path.getvalue()
	    else:
	        with open(file_path, 'rb') as handle:
	            raw = handle.read()
	    # utf-8-sig decodes plain UTF-8 identically but also drops a leading BOM,
	    # which would otherwise prevent the first line's timestamp from parsing.
	    return raw.decode('utf-8-sig')


	def _clean_message_text(text):
	    """Strip URLs, lowercase, and remove attachment/edit markers from one message."""
	    text = re.sub(r'https?:\/\/\S+', '', text).lower()
	    marker_patterns = (
	        # NOTE(review): the pipes are escaped, so this only matches the literal
	        # ": | : | :" sequence; kept as-is to preserve existing output.
	        r'(?<!\w)(:\s\|\s:\s\|\s:)',
	        r'\[image omitted\]',
	        r'\[sticker omitted\]',
	        r'\[document omitted\]',
	        r'<se editó este mensaje.>',
	        # Trailing character made optional so the marker is also removed when
	        # it is written without a final period.
	        r'<this message was edited.?>',
	    )
	    for pattern in marker_patterns:
	        text = re.sub(pattern, '', text)
	    return text


	def _empty_daily_frame():
	    """Return an empty frame with the same columns as the normal result."""
	    return pd.DataFrame({
	        'date': pd.Series(dtype='datetime64[ns]'),
	        'text': pd.Series(dtype=object),
	    })


	def preprocess_whatsapp_messages(file_path, file_type):
	    """
	    Preprocesses a WhatsApp chat export into a Pandas DataFrame with one row per
	    day: all messages of the same date are cleaned and joined with newlines.

	    Only lines of the form ``[dd/mm/yy, HH:MM:SS] text`` are kept. Lines without
	    a parsable timestamp are dropped, which includes the continuation lines of
	    multi-line messages.

	    Args:
	        file_path: For ``"zip"``, anything ``zipfile.ZipFile`` accepts (path or
	            file-like object). Otherwise an object exposing ``getvalue()`` that
	            returns bytes, or a filesystem path to a UTF-8 text file.
	        file_type (str): ``"zip"`` to read the chat log out of an archive; any
	            other value treats ``file_path`` as the text source itself.

	    Returns:
	        pandas.DataFrame: Columns ``date`` (datetime64) and ``text`` (str),
	        sorted by date. Empty (with the same columns) when no line can be
	        parsed.
	    """
	    text_data = _read_chat_text(file_path, file_type)

	    # Split into lines; drop the '\r' that CRLF line endings would leave behind.
	    lines = [line.rstrip('\r') for line in text_data.strip().split('\n')]
	    df = pd.DataFrame(lines, columns=['message'])

	    # Separate "[timestamp" from the text at the first ']'. extract() always
	    # yields two columns (NaN when a line has no ']'), unlike split(expand=True)
	    # which yields a single column when no line contains ']'.
	    parts = df['message'].str.extract(r'^([^\]]*)\](.*)$', flags=re.DOTALL)
	    df['timestamp'] = parts[0].str.strip('[')
	    df['text'] = parts[1]
	    df.dropna(subset=['timestamp', 'text'], inplace=True)
	    if df.empty:
	        return _empty_daily_frame()

	    # Keep only the calendar date; unparsable timestamps become NaT and are dropped.
	    df['timestamp'] = pd.to_datetime(
	        df['timestamp'], format='%d/%m/%y, %H:%M:%S', errors='coerce'
	    ).dt.date
	    df.dropna(subset=['timestamp'], inplace=True)

	    # Remove the initial WhatsApp system message in English and Spanish.
	    filter_text_en = "Your messages and calls are end-to-end encrypted"
	    filter_text_es = "Los mensajes y las llamadas están cifrados de extremo a extremo"
	    is_system_notice = (
	        df['text'].str.contains(filter_text_en, na=False, regex=False)
	        | df['text'].str.contains(filter_text_es, na=False, regex=False)
	    )
	    # copy() so the column assignment below acts on an independent frame
	    # instead of a slice of the original.
	    df = df[~is_system_notice].copy()
	    if df.empty:
	        return _empty_daily_frame()

	    df['text'] = df['text'].apply(_clean_message_text)

	    # Group by date and concatenate all messages from the same date.
	    df = df.groupby('timestamp')['text'].agg('\n'.join).reset_index()
	    df.columns = ['date', 'text']
	    df['date'] = pd.to_datetime(df['date'])
	    df['text'] = df['text'].astype(str)

	    return df

	def get_dated_input(data, selected_date):
	    '''
	    Return the text stored for one calendar date.

	    :param data: DataFrame with a datetime64 ``date`` column and a ``text``
	        column.
	    :param selected_date: Date to look up; anything ``pd.to_datetime`` accepts.
	        Only its calendar date is used for matching.
	    :return: The ``text`` value of the first row whose date matches.
	    :raises IndexError: If no row matches ``selected_date``.
	    '''
	    target_day = pd.to_datetime(selected_date).date()
	    matching_text = data.loc[data['date'].dt.date == target_day, 'text']
	    if matching_text.empty:
	        # Same exception type as indexing an empty selection, with context added.
	        raise IndexError(f"no messages found for date {target_day}")
	    return matching_text.iloc[0]