from typing import Dict, List

import pandas as pd

from utilities.my_logger import setup_logger

# Setup logging
logger = setup_logger(__name__)


def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw submission data into a DataFrame.

    Parameters:
    - submissions: List of submission dictionaries. Non-empty input must
      provide the 'date_utc' and 'permalink' keys; 'poster_link' is optional.

    Returns:
    - pd.DataFrame: One row per submission, with 'date_utc' parsed by
      pd.to_datetime, 'poster_link' removed if it was present, and an 'id'
      column derived from 'permalink'. An empty input yields an empty
      DataFrame.

    Raises:
    - KeyError: If non-empty input lacks the 'date_utc' or 'permalink' column.
    """
    praw_df = pd.DataFrame(submissions)

    # With no records there are no columns to transform; return the empty
    # frame instead of failing on the missing 'date_utc' column.
    if not submissions:
        return praw_df

    # Bracket indexing is used for column writes: attribute-style assignment
    # only works when the column already exists and can clash with
    # DataFrame attribute names.
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # 'poster_link' is optional, so a missing column is not an error.
    praw_df = praw_df.drop(columns='poster_link', errors='ignore')

    # Take the piece at index 4 (the fifth piece) of the permalink split on
    # '/', e.g. '/r/sub/comments/abc123/title/' -> 'abc123'. Permalinks with
    # fewer pieces produce a missing value.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df


def preprocess_praw_comment_data(comments: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw comment data into a DataFrame.

    Parameters:
    - comments: List of comment dictionaries. Non-empty input must provide
      the 'date_utc' key.

    Returns:
    - pd.DataFrame: One row per comment, with 'date_utc' parsed by
      pd.to_datetime. An empty input yields an empty DataFrame.

    Raises:
    - KeyError: If non-empty input lacks the 'date_utc' column.
    """
    praw_df = pd.DataFrame(comments)

    # With no records there is no 'date_utc' column to parse.
    if not comments:
        return praw_df

    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    return praw_df