import logging
import re
import zipfile
from io import BytesIO

import pandas as pd

logger = logging.getLogger(__name__)

# Timestamp layout expected between the square brackets of each message line.
_TIMESTAMP_FORMAT = "%d/%m/%y, %H:%M:%S"

# First-line system notices (English and Spanish) that are not real messages.
_ENCRYPTION_NOTICES = (
    "Your messages and calls are end-to-end encrypted",
    "Los mensajes y las llamadas están cifrados de extremo a extremo",
)

_URL_RE = re.compile(r"https?:\/\/\S+")

# NOTE(review): the original patterns for this step could not be recovered from
# the source (one was the truncated, invalid regex "(?" and the other was an
# empty pattern). The patterns below are a reconstruction of the documented
# intent -- confirm them against real chat exports. They run on text that has
# already been lower-cased.
_PLACEHOLDER_RE = re.compile(
    r"<(?:attached|adjunto):[^>\n]*>"   # attachment placeholders (images, stickers, documents)
    r"|<this message was edited>"        # edit marker, English
    r"|<se editó este mensaje\.?>"       # edit marker, Spanish
)
# Common emoji code points only; ':' and accented letters are never matched, so
# the colon after the sender name is preserved.
_EMOJI_RE = re.compile(
    "[\U0001F000-\U0001FAFF\u2600-\u27BF\u2B00-\u2BFF\uFE0F\u200D]+"
)


def detect_file_type(file_path):
    """Classify a file by the last three characters of its name.

    Args:
        file_path (str): File name or path.

    Returns:
        str: "txt" or "zip" (matched case-insensitively), otherwise "unknown".
    """
    extension = file_path[-3:].lower()
    logger.debug("Detected extension: %s", extension)
    return extension if extension in ("txt", "zip") else "unknown"


def _read_chat_text(source, file_type):
    """Return the decoded chat text from a zip archive, in-memory buffer or path."""
    if file_type == "zip":
        with zipfile.ZipFile(source, "r") as archive:
            names = archive.namelist()
            # Exports may bundle media files; prefer the text member and fall
            # back to the first entry (IndexError on an empty archive, as before).
            chat_name = names[0]
            for name in names:
                if name.lower().endswith(".txt"):
                    chat_name = name
                    break
            raw = archive.read(chat_name)
    elif hasattr(source, "getvalue"):
        # In-memory upload object exposing getvalue() (e.g. BytesIO).
        raw = source.getvalue()
    else:
        # Plain filesystem path.
        with open(source, "rb") as handle:
            raw = handle.read()
    # "utf-8-sig" also drops a leading BOM, which would otherwise break parsing
    # of the first line's timestamp.
    return raw.decode("utf-8-sig")


def _clean_message_text(text):
    """Strip URLs, lower-case, then strip placeholders/edit markers and emojis."""
    text = _URL_RE.sub("", text)
    text = text.lower()
    text = _PLACEHOLDER_RE.sub("", text)
    return _EMOJI_RE.sub("", text)


def _empty_daily_frame():
    """Return an empty result frame with the same columns as a populated one."""
    return pd.DataFrame(
        {"date": pd.Series(dtype="datetime64[ns]"), "text": pd.Series(dtype=str)}
    )


def preprocess_whatsapp_messages(file_path, file_type):
    """Turn a WhatsApp chat export into one row of text per calendar day.

    Each line is split at the first ']' into a timestamp part and a text part.
    Lines without a ']' or whose timestamp does not match
    "%d/%m/%y, %H:%M:%S" are dropped (this includes continuation lines of
    multi-line messages). The end-to-end-encryption notice is removed, URLs,
    attachment placeholders, edit markers and emojis are stripped, the text is
    lower-cased, and all messages of the same day are joined with newlines.

    Args:
        file_path: Zip archive (path or file-like object) when ``file_type`` is
            "zip"; otherwise an object exposing ``getvalue()`` that returns
            bytes, or a filesystem path to a text file.
        file_type (str): "zip" to read from an archive; any other value reads
            plain text.

    Returns:
        pandas.DataFrame: Columns ``date`` (datetime64) and ``text`` (str),
        one row per day, sorted by date. Empty if no line could be parsed.
    """
    logger.debug("Preprocessing file of type: %s", file_type)
    text_data = _read_chat_text(file_path, file_type)

    # Split every line at the first ']' into (timestamp, text).
    rows = []
    for line in text_data.strip().split("\n"):
        stamp, bracket, body = line.partition("]")
        if bracket:
            rows.append((stamp.strip("["), body))
    if not rows:
        return _empty_daily_frame()

    messages = pd.DataFrame(rows, columns=["timestamp", "text"])

    # Keep only the calendar date; unparseable timestamps become NaT and are dropped.
    messages["timestamp"] = pd.to_datetime(
        messages["timestamp"], format=_TIMESTAMP_FORMAT, errors="coerce"
    ).dt.date
    messages = messages.dropna(subset=["timestamp"])

    # Remove the initial WhatsApp system notice (matched before lower-casing).
    is_notice = pd.Series(False, index=messages.index)
    for notice in _ENCRYPTION_NOTICES:
        is_notice |= messages["text"].str.contains(notice, regex=False)
    messages = messages[~is_notice]
    if messages.empty:
        return _empty_daily_frame()

    # Group by date and concatenate all messages from the same date.
    daily = (
        messages.assign(text=messages["text"].map(_clean_message_text))
        .groupby("timestamp")["text"]
        .agg("\n".join)
        .reset_index()
    )
    daily.columns = ["date", "text"]
    daily["date"] = pd.to_datetime(daily["date"])
    daily["text"] = daily["text"].astype(str)
    return daily


def get_dated_input(data, selected_date):
    """Return the concatenated chat text for one day.

    Args:
        data (pandas.DataFrame): Frame with a datetime ``date`` column and a
            ``text`` column, as produced by ``preprocess_whatsapp_messages``.
        selected_date: Anything ``pandas.to_datetime`` accepts; only its date
            part is used.

    Returns:
        The ``text`` value of the first row whose date matches.

    Raises:
        IndexError: If no row matches ``selected_date``.
    """
    target_day = pd.to_datetime(selected_date).date()
    day_rows = data[data["date"].dt.date == target_day]
    if day_rows.empty:
        raise IndexError(f"No messages found for {target_day}")
    return day_rows["text"].iloc[0]