alvanli committed
Commit 67a3546
Parent(s): ce087e5

add comment ds

Files changed:
- main.py +29 -12
- utilities/praw_downloader.py +23 -4
- utilities/praw_processor.py +19 -0
- utilities/user_defined_functions.py +44 -3
main.py
CHANGED
@@ -6,7 +6,14 @@ import pandas as pd
 import schedule
 from datasets import Dataset
 
-from utilities.user_defined_functions import
+from utilities.user_defined_functions import (
+    get_latest_data,
+    merge_data,
+    load_or_create_dataset,
+    remove_filtered_rows,
+    load_or_create_comment_dataset
+)
+
 from utilities.my_logger import setup_logger
 from utilities.readme_update import update_dataset_readme
 
@@ -14,6 +21,8 @@ from utilities.readme_update import update_dataset_readme
 subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/reddit-{subreddit}"
+comment_dataset_name = f"{username}/reddit-comments-{subreddit}"
+
 dataset_readme_path = "README.md"
 
 frequency = os.environ.get("FREQUENCY", '').lower()
@@ -25,14 +34,9 @@ auth_token = os.environ["HF_TOKEN"]
 
 logger = setup_logger(__name__)
 
-
-def main():
+def upload(new_df, dataset, hf_dataset_name):
     date = datetime.now().strftime('%Y-%m-%d')
-
-    dataset = load_or_create_dataset()
-
-    new_df = get_latest_data()
-
+
     # Using dataset from hub
     if 'train' in dataset.keys():
         old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
@@ -51,13 +55,26 @@ def main():
     logger.info(f"Adding {new_rows} rows for {date}.")
 
     # Push the augmented dataset to the Hugging Face hub
-    logger.debug(f"Pushing data for {date} to
-    dataset.push_to_hub(
-    logger.info(f"Processed and pushed data for {date} to
-    update_dataset_readme(dataset_name=
+    logger.debug(f"Pushing data for {date} to {hf_dataset_name}")
+    dataset.push_to_hub(hf_dataset_name, token=auth_token)
+    logger.info(f"Processed and pushed data for {date} to {hf_dataset_name}")
+    update_dataset_readme(dataset_name=hf_dataset_name, subreddit=subreddit, new_rows=new_rows)
     logger.info(f"Updated README.")
 
 
+def main():
+    date = datetime.now().strftime('%Y-%m-%d')
+
+    logger.warning(f"Running main function for date: {date}")
+    sub_dataset = load_or_create_dataset()
+    new_df, new_df_comment = get_latest_data()
+
+    upload(new_df, sub_dataset, dataset_name)
+
+    comment_dataset = load_or_create_comment_dataset()
+    upload(new_df_comment, comment_dataset, comment_dataset_name)
+
+
 def schedule_periodic_task():
     """
     Schedule the main task to run at the user-defined frequency
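For orientation, a minimal configuration sketch (not part of the commit): it sets only the environment variables this diff shows being read at module level, with placeholder values. The Reddit API credentials that PRAW needs elsewhere in the pipeline are not covered here.

# Hypothetical setup sketch with placeholder values.
import os

os.environ["SUBREDDIT"] = "some_subreddit"    # used in f"{username}/reddit-{subreddit}"
os.environ["USERNAME"] = "your-hf-username"   # and in f"{username}/reddit-comments-{subreddit}"
os.environ["HF_TOKEN"] = "hf_xxx"             # placeholder token passed to push_to_hub
os.environ["FREQUENCY"] = "daily"             # "daily" or "hourly", validated in user_defined_functions.py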
utilities/praw_downloader.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 from datetime import datetime
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple
 
 import praw
 
@@ -37,8 +37,24 @@ def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any
         "nsfw": submission.over_18,
     }
 
+def extract_comment_data(comment: praw.models.Comment) -> Dict[str, Any]:
+    """Extract and return relevant data from a given Reddit comment"""
+    return {
+        'content': comment.body,
+        'poster': str(comment.author),
+        'date_utc': datetime.utcfromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
+        'flair': comment.author_flair_text,
+        'ups': comment.ups,
+        'score': comment.score,
+        'permalink': comment.permalink,
+        'depth': comment.depth,
+        'link_id': comment.link_id,
+        'submission_id': comment._submission.id,
+        'id': comment.id
+    }
+
 
-def praw_downloader() -> List[Dict[str, str]]:
+def praw_downloader() -> Tuple[List[Dict[str, str]]]:
     """Main function to extract and save all submissions from the subreddit."""
     reddit = get_reddit_instance()
     subreddit = reddit.subreddit(subreddit_var)
@@ -46,13 +62,16 @@ def praw_downloader() -> List[Dict[str, str]]:
     logger.info(f'Starting to fetch submissions from {os.getenv("SUBREDDIT")}.')
 
     submissions = []
+    comments = []
     for submission in subreddit.new(limit=reddit_pull_limit):  # Set limit=None to get all posts
         # logger.debug(f'Processing post {submission.id} - {submission.title}')
         data = extract_submission_data(submission)
+        for comment in submission.comments.list():
+            comments.append(extract_comment_data(comment))
        submissions.append(data)
 
-    logger.info(f'Finished downloading {len(submissions)} submissions
-    return submissions
+    logger.info(f'Finished downloading {len(submissions)} submissions, {len(comments)} comments')
+    return submissions, comments
 
 
 if __name__ == "__main__":
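One assumption worth flagging in the new loop: extract_comment_data reads attributes such as comment.body and comment.depth, so every item yielded by submission.comments.list() must be a regular Comment. On heavily commented posts, .list() can also return MoreComments placeholders; a common guard (a sketch, not part of this commit) is to expand or drop them first.

# Sketch only: defensive iteration over a submission's comments.
from praw.models import MoreComments

def iter_comments(submission):
    # Expand "load more comments" stubs; limit=0 drops them rather than fetching more.
    submission.comments.replace_more(limit=0)
    for comment in submission.comments.list():
        if isinstance(comment, MoreComments):  # should not occur after replace_more; kept as a guard
            continue
        yield comment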
utilities/praw_processor.py
CHANGED
@@ -33,3 +33,22 @@ def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
     praw_df['id'] = praw_df.permalink.str.split('/').str[4]
 
     return praw_df
+
+def preprocess_praw_comment_data(comments: List[Dict]) -> pd.DataFrame:
+    """
+    Preprocesses praw comment data into a DataFrame.
+
+    Parameters:
+    - submissions: List of submission dictionaries.
+
+    Returns:
+    - pd.DataFrame: Preprocessed DataFrame.
+    """
+
+    # Convert the submissions list to a DataFrame
+    praw_df = pd.DataFrame(comments)
+
+    # Convert 'date' column to datetime format
+    praw_df.date_utc = pd.to_datetime(praw_df.date_utc)
+
+    return praw_df
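As a quick, self-contained illustration of what the new preprocess_praw_comment_data does, the snippet below (not part of the commit) builds one record shaped like extract_comment_data's output, with fabricated field values, and applies the same date_utc conversion.

# Illustration only: fabricated comment record, same conversion step as the new function.
import pandas as pd

sample_comments = [{
    "content": "Nice write-up!",
    "poster": "some_redditor",
    "date_utc": "2023-10-26 14:30:45",
    "flair": None,
    "ups": 3,
    "score": 3,
    "permalink": "/r/sample/comments/abc123/title/def456/",
    "depth": 0,
    "link_id": "t3_abc123",
    "submission_id": "abc123",
    "id": "def456",
}]

praw_df = pd.DataFrame(sample_comments)
praw_df.date_utc = pd.to_datetime(praw_df.date_utc)  # same conversion as the new function
print(praw_df.date_utc.dtype)                        # datetime64[ns]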
utilities/user_defined_functions.py
CHANGED
@@ -9,12 +9,13 @@ from huggingface_hub import login
 from utilities.data_processing import data_processing
 from utilities.my_logger import setup_logger
 from utilities.praw_downloader import praw_downloader
-from utilities.praw_processor import preprocess_praw_data
+from utilities.praw_processor import preprocess_praw_data, preprocess_praw_comment_data
 
 # Set dataset name, path to README.md, and existing dataset details
 subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/reddit-{subreddit}"
+comment_dataset_name = f"{username}/reddit-comments-{subreddit}"
 
 frequency = os.environ.get("FREQUENCY", '').lower()
 if frequency not in ["daily", "hourly"]:
@@ -41,6 +42,23 @@ dummy_data = {
     "nsfw": [False]
 }
 
+dummy_comment_data = {
+    "id": ['id'],
+    "content": ["This is a sample post content. Just for demonstration purposes!"],
+    "poster": ["sampleUser123"],
+    "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
+    "flair": ["Discussion"],
+    "title": ["Sample Post Title: How to Use Hugging Face?"],
+    "ups": [457],
+    "score": [457],
+    "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
+    "updated": [False],
+    "new": [False],
+    "depth": [2],
+    "link_id": ["eqrkhgbjeh"],
+    "submission_id": ["eqrkhgbjeh"]
+}
+
 
 def load_or_create_dataset():
     """
@@ -77,6 +95,28 @@ def load_or_create_dataset():
     return dataset
 
 
+def load_or_create_comment_dataset():
+    # Load the existing dataset from the Hugging Face hub or create a new one
+    try:
+        logger.debug(f"Trying to download {comment_dataset_name}")
+        dataset = load_dataset(comment_dataset_name, download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        logger.debug("Loading existing comment dataset")
+    except FileNotFoundError:
+        logger.warning("Creating new comment dataset")
+
+        # Creating Initial Repo
+        dataset = DatasetDict()
+        dataset['train'] = Dataset.from_dict(dummy_comment_data)
+        dataset.push_to_hub(repo_id=comment_dataset_name, token=auth_token)
+
+        # Pulling from Initial Repo
+        dataset = load_dataset(comment_dataset_name)
+
+        # Remove dummy data
+        del dataset['train']
+    return dataset
+
+
 def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
     """
     Merges two dataframes, sorts them by 'date_utc', and marks new IDs.
@@ -125,6 +165,7 @@ def remove_filtered_rows(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def get_latest_data():
-    submissions = praw_downloader()
+    submissions, comments = praw_downloader()
     df = preprocess_praw_data(submissions=submissions)
-    return df
+    df_comments = preprocess_praw_comment_data(comments=comments)
+    return df, df_comments
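One behavioural detail of load_or_create_comment_dataset worth keeping in mind: when it has to create the repo, it deletes the dummy 'train' split before returning, so callers receive a DatasetDict that may have no splits at all. The helper below is a small sketch (not part of the commit) of the pattern upload() in main.py uses to handle both cases.

# Sketch only: tolerate a dataset with or without a 'train' split.
import pandas as pd

def existing_rows(dataset) -> pd.DataFrame:
    """Return previously pushed rows if a 'train' split exists, else an empty frame."""
    if "train" in dataset.keys():
        return dataset["train"].to_pandas()
    return pd.DataFrame()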