Commit
•
f50c7ba
1
Parent(s):
904b8a3
Loading dataset once
Browse files
main.py
CHANGED
@@ -23,27 +23,17 @@ login(auth_token, add_to_git_credential=True)
|
|
23 |
logger = setup_logger(__name__)
|
24 |
|
25 |
|
26 |
-
def main(date_to_fetch):
|
27 |
"""
|
28 |
Runs the main data processing function to fetch and process subreddit data for the specified date.
|
29 |
|
30 |
Args:
|
31 |
-
|
|
|
32 |
|
33 |
Returns:
|
34 |
-
most_recent_date (str):
|
35 |
"""
|
36 |
-
|
37 |
-
# Load the existing dataset from the Hugging Face hub or create a new one
|
38 |
-
try:
|
39 |
-
dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
|
40 |
-
logger.debug("Loading existing dataset")
|
41 |
-
if "__index_level_0__" in dataset["all_days"].column_names:
|
42 |
-
dataset = dataset.remove_columns(["__index_level_0__"])
|
43 |
-
except FileNotFoundError:
|
44 |
-
logger.warning("Creating new dataset")
|
45 |
-
dataset = DatasetDict()
|
46 |
-
|
47 |
# Call get_subreddit_day with the calculated date
|
48 |
logger.info(f"Fetching data for {str(date_to_fetch)}")
|
49 |
submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
|
@@ -102,13 +92,23 @@ def run_main_continuously():
|
|
102 |
# Calculate the start time for running the main_function every day.
|
103 |
start_time = datetime.now().time()
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
while True:
|
106 |
today = datetime.now().date()
|
107 |
two_days_ago = today - timedelta(days=2)
|
108 |
|
109 |
if start_date <= two_days_ago:
|
110 |
logger.warning(f"Running main function for date: {start_date}")
|
111 |
-
most_recent_date = main(start_date)
|
112 |
start_date = most_recent_date + timedelta(days=1)
|
113 |
else:
|
114 |
tomorrow = today + timedelta(days=1)
|
|
|
23 |
logger = setup_logger(__name__)
|
24 |
|
25 |
|
26 |
+
def main(dataset, date_to_fetch):
|
27 |
"""
|
28 |
Runs the main data processing function to fetch and process subreddit data for the specified date.
|
29 |
|
30 |
Args:
|
31 |
+
dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
|
32 |
+
date_to_fetch (str): The date to fetch subreddit data for, in YYYY-MM-DD format.
|
33 |
|
34 |
Returns:
|
35 |
+
most_recent_date (str): The most recent date in the updated dataset.
|
36 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
# Call get_subreddit_day with the calculated date
|
38 |
logger.info(f"Fetching data for {str(date_to_fetch)}")
|
39 |
submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
|
|
|
92 |
# Calculate the start time for running the main_function every day.
|
93 |
start_time = datetime.now().time()
|
94 |
|
95 |
+
# Load the existing dataset from the Hugging Face hub or create a new one
|
96 |
+
try:
|
97 |
+
dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
|
98 |
+
logger.debug("Loading existing dataset")
|
99 |
+
if "__index_level_0__" in dataset["all_days"].column_names:
|
100 |
+
dataset = dataset.remove_columns(["__index_level_0__"])
|
101 |
+
except FileNotFoundError:
|
102 |
+
logger.warning("Creating new dataset")
|
103 |
+
dataset = DatasetDict()
|
104 |
+
|
105 |
while True:
|
106 |
today = datetime.now().date()
|
107 |
two_days_ago = today - timedelta(days=2)
|
108 |
|
109 |
if start_date <= two_days_ago:
|
110 |
logger.warning(f"Running main function for date: {start_date}")
|
111 |
+
most_recent_date = main(dataset, start_date)
|
112 |
start_date = most_recent_date + timedelta(days=1)
|
113 |
else:
|
114 |
tomorrow = today + timedelta(days=1)
|