derek-thomas HF staff committed on
Commit
f50c7ba
1 Parent(s): 904b8a3

Loading dataset once

Browse files
Files changed (1) hide show
  1. main.py +15 -15
main.py CHANGED
@@ -23,27 +23,17 @@ login(auth_token, add_to_git_credential=True)
23
  logger = setup_logger(__name__)
24
 
25
 
26
- def main(date_to_fetch):
27
  """
28
  Runs the main data processing function to fetch and process subreddit data for the specified date.
29
 
30
  Args:
31
- date_to_fetch (datetime.date): The date to fetch subreddit data for
 
32
 
33
  Returns:
34
- most_recent_date (str): Most recent date in dataset
35
  """
36
-
37
- # Load the existing dataset from the Hugging Face hub or create a new one
38
- try:
39
- dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
40
- logger.debug("Loading existing dataset")
41
- if "__index_level_0__" in dataset["all_days"].column_names:
42
- dataset = dataset.remove_columns(["__index_level_0__"])
43
- except FileNotFoundError:
44
- logger.warning("Creating new dataset")
45
- dataset = DatasetDict()
46
-
47
  # Call get_subreddit_day with the calculated date
48
  logger.info(f"Fetching data for {str(date_to_fetch)}")
49
  submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
@@ -102,13 +92,23 @@ def run_main_continuously():
102
  # Calculate the start time for running the main_function every day.
103
  start_time = datetime.now().time()
104
 
 
 
 
 
 
 
 
 
 
 
105
  while True:
106
  today = datetime.now().date()
107
  two_days_ago = today - timedelta(days=2)
108
 
109
  if start_date <= two_days_ago:
110
  logger.warning(f"Running main function for date: {start_date}")
111
- most_recent_date = main(start_date)
112
  start_date = most_recent_date + timedelta(days=1)
113
  else:
114
  tomorrow = today + timedelta(days=1)
 
23
  logger = setup_logger(__name__)
24
 
25
 
26
+ def main(dataset, date_to_fetch):
27
  """
28
  Runs the main data processing function to fetch and process subreddit data for the specified date.
29
 
30
  Args:
31
+ dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
32
+ date_to_fetch (datetime.date): The date to fetch subreddit data for; it is converted with str() before scraping.
33
 
34
  Returns:
35
+ most_recent_date (datetime.date): The most recent date in the updated dataset; the caller adds a timedelta to it to get the next date to fetch.
36
  """
 
 
 
 
 
 
 
 
 
 
 
37
  # Call get_subreddit_day with the calculated date
38
  logger.info(f"Fetching data for {str(date_to_fetch)}")
39
  submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
 
92
  # Calculate the start time for running the main_function every day.
93
  start_time = datetime.now().time()
94
 
95
+ # Load the existing dataset from the Hugging Face hub or create a new one
96
+ try:
97
+ dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
98
+ logger.debug("Loading existing dataset")
99
+ if "__index_level_0__" in dataset["all_days"].column_names:
100
+ dataset = dataset.remove_columns(["__index_level_0__"])
101
+ except FileNotFoundError:
102
+ logger.warning("Creating new dataset")
103
+ dataset = DatasetDict()
104
+
105
  while True:
106
  today = datetime.now().date()
107
  two_days_ago = today - timedelta(days=2)
108
 
109
  if start_date <= two_days_ago:
110
  logger.warning(f"Running main function for date: {start_date}")
111
+ most_recent_date = main(dataset, start_date)
112
  start_date = most_recent_date + timedelta(days=1)
113
  else:
114
  tomorrow = today + timedelta(days=1)