Spaces:

reddit-tools-HF
/

dataset-creator-reddit-bestofredditorupdates

Running

App Files Files Community

derek-thomas HF staff committed on Apr 15, 2023

Commit

f50c7ba

•

1 Parent(s): 904b8a3

Loading dataset once

Browse files

Files changed (1) hide show

main.py +15 -15

main.py CHANGED Viewed

@@ -23,27 +23,17 @@ login(auth_token, add_to_git_credential=True)
 logger = setup_logger(__name__)
-def main(date_to_fetch):
     """
     Runs the main data processing function to fetch and process subreddit data for the specified date.
     Args:
-        date_to_fetch (datetime.date): The date to fetch subreddit data for
     Returns:
-        most_recent_date (str): Most recent date in dataset
     """
-    # Load the existing dataset from the Hugging Face hub or create a new one
-    try:
-        dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
-        logger.debug("Loading existing dataset")
-        if "__index_level_0__" in dataset["all_days"].column_names:
-            dataset = dataset.remove_columns(["__index_level_0__"])
-    except FileNotFoundError:
-        logger.warning("Creating new dataset")
-        dataset = DatasetDict()
     # Call get_subreddit_day with the calculated date
     logger.info(f"Fetching data for {str(date_to_fetch)}")
     submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
@@ -102,13 +92,23 @@ def run_main_continuously():
     # Calculate the start time for running the main_function every day.
     start_time = datetime.now().time()
     while True:
         today = datetime.now().date()
         two_days_ago = today - timedelta(days=2)
         if start_date <= two_days_ago:
             logger.warning(f"Running main function for date: {start_date}")
-            most_recent_date = main(start_date)
             start_date = most_recent_date + timedelta(days=1)
         else:
             tomorrow = today + timedelta(days=1)

 logger = setup_logger(__name__)
+def main(dataset, date_to_fetch):
     """
     Runs the main data processing function to fetch and process subreddit data for the specified date.
     Args:
+        dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
+        date_to_fetch (str): The date to fetch subreddit data for, in YYYY-MM-DD format.
     Returns:
+        most_recent_date (str): The most recent date in the updated dataset.
     """
     # Call get_subreddit_day with the calculated date
     logger.info(f"Fetching data for {str(date_to_fetch)}")
     submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
     # Calculate the start time for running the main_function every day.
     start_time = datetime.now().time()
+    # Load the existing dataset from the Hugging Face hub or create a new one
+    try:
+        dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
+        logger.debug("Loading existing dataset")
+        if "__index_level_0__" in dataset["all_days"].column_names:
+            dataset = dataset.remove_columns(["__index_level_0__"])
+    except FileNotFoundError:
+        logger.warning("Creating new dataset")
+        dataset = DatasetDict()
     while True:
         today = datetime.now().date()
         two_days_ago = today - timedelta(days=2)
         if start_date <= two_days_ago:
             logger.warning(f"Running main function for date: {start_date}")
+            most_recent_date = main(dataset, start_date)
             start_date = most_recent_date + timedelta(days=1)
         else:
             tomorrow = today + timedelta(days=1)