Commit
•
bcf2055
1
Parent(s):
76a52b4
Adding code to show new rows
Browse files
- main.py +6 -2
- utilities/data_collator.py +15 -8
- utilities/praw_downloader.py +3 -1
main.py
CHANGED
@@ -7,7 +7,7 @@ import schedule
|
|
7 |
from datasets import Dataset, DatasetDict, load_dataset
|
8 |
from huggingface_hub import login
|
9 |
|
10 |
-
from utilities.data_collator import get_latest_data,
|
11 |
from utilities.my_logger import setup_logger
|
12 |
from utilities.praw_downloader import dummy_data
|
13 |
from utilities.readme_update import update_readme
|
@@ -57,10 +57,13 @@ def main():
|
|
57 |
|
58 |
# Get Latest Data and merge with historic data
|
59 |
new_df = get_latest_data()
|
|
|
|
|
60 |
if 'train' in dataset.keys():
|
61 |
old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
|
62 |
-
df =
|
63 |
new_rows = len(df) - len(old_df)
|
|
|
64 |
else:
|
65 |
df = new_df
|
66 |
new_rows = len(new_df)
|
@@ -80,6 +83,7 @@ def schedule_periodic_task():
|
|
80 |
"""
|
81 |
Schedule the main task to run at the user-defined frequency
|
82 |
"""
|
|
|
83 |
if frequency == 'hourly':
|
84 |
logger.info(f'Scheduling tasks to run every hour at the top of the hour')
|
85 |
schedule.every().hour.at(":00").do(main)
|
|
|
7 |
from datasets import Dataset, DatasetDict, load_dataset
|
8 |
from huggingface_hub import login
|
9 |
|
10 |
+
from utilities.data_collator import get_latest_data, merge_data
|
11 |
from utilities.my_logger import setup_logger
|
12 |
from utilities.praw_downloader import dummy_data
|
13 |
from utilities.readme_update import update_readme
|
|
|
57 |
|
58 |
# Get Latest Data and merge with historic data
|
59 |
new_df = get_latest_data()
|
60 |
+
|
61 |
+
# Using dataset from hub
|
62 |
if 'train' in dataset.keys():
|
63 |
old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
|
64 |
+
df = merge_data(old_df=old_df, new_df=new_df)
|
65 |
new_rows = len(df) - len(old_df)
|
66 |
+
# New dataset
|
67 |
else:
|
68 |
df = new_df
|
69 |
new_rows = len(new_df)
|
|
|
83 |
"""
|
84 |
Schedule the main task to run at the user-defined frequency
|
85 |
"""
|
86 |
+
main()
|
87 |
if frequency == 'hourly':
|
88 |
logger.info(f'Scheduling tasks to run every hour at the top of the hour')
|
89 |
schedule.every().hour.at(":00").do(main)
|
utilities/data_collator.py
CHANGED
@@ -42,7 +42,6 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
|
42 |
# Merge the two DataFrames on 'id'
|
43 |
df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
|
44 |
|
45 |
-
|
46 |
# Check if the content or score was updated for each id
|
47 |
df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
|
48 |
df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
|
@@ -57,27 +56,35 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
|
57 |
return df_merged
|
58 |
|
59 |
|
60 |
-
def
|
61 |
"""
|
62 |
-
Merges two dataframes, sorts them by 'date_utc', and
|
63 |
|
64 |
-
The function first concatenates the old and new dataframes.
|
65 |
-
resulting dataframe by the 'date_utc' column.
|
66 |
-
|
67 |
|
68 |
Args:
|
69 |
- old_df (pd.DataFrame): The original dataframe.
|
70 |
- new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
|
71 |
|
72 |
Returns:
|
73 |
-
- pd.DataFrame: The merged, sorted, and
|
74 |
"""
|
75 |
|
|
|
|
|
|
|
|
|
76 |
# Concatenate old and new dataframes, sort by 'date_utc', and reset index
|
77 |
df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
|
78 |
|
79 |
-
#
|
80 |
df = filter_redundant_ids(df)
|
|
|
|
|
|
|
|
|
81 |
return df
|
82 |
|
83 |
|
|
|
42 |
# Merge the two DataFrames on 'id'
|
43 |
df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
|
44 |
|
|
|
45 |
# Check if the content or score was updated for each id
|
46 |
df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
|
47 |
df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
|
|
|
56 |
return df_merged
|
57 |
|
58 |
|
59 |
+
def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
|
60 |
"""
|
61 |
+
Merges two dataframes, sorts them by 'date_utc', and marks new IDs.
|
62 |
|
63 |
+
The function first marks rows from the new dataframe, then concatenates the old and new dataframes.
|
64 |
+
It sorts the resulting dataframe by the 'date_utc' column. Rows from the new dataframe that are not
|
65 |
+
in the old dataframe are marked as 'new'.
|
66 |
|
67 |
Args:
|
68 |
- old_df (pd.DataFrame): The original dataframe.
|
69 |
- new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
|
70 |
|
71 |
Returns:
|
72 |
+
- pd.DataFrame: The merged, sorted, and marked dataframe.
|
73 |
"""
|
74 |
|
75 |
+
# Mark rows in old and new dataframes
|
76 |
+
old_df['new'] = False
|
77 |
+
new_df['new'] = True
|
78 |
+
|
79 |
# Concatenate old and new dataframes, sort by 'date_utc', and reset index
|
80 |
df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
|
81 |
|
82 |
+
# Optional: If you have a function to filter redundant IDs, you can use it here
|
83 |
df = filter_redundant_ids(df)
|
84 |
+
|
85 |
+
# Identify new rows (present in new_df but not in old_df)
|
86 |
+
df['new'] = df['new'] & ~df['id'].duplicated(keep=False)
|
87 |
+
|
88 |
return df
|
89 |
|
90 |
|
utilities/praw_downloader.py
CHANGED
@@ -15,6 +15,7 @@ reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
|
|
15 |
|
16 |
# Dummy row for when we create a new repo
|
17 |
dummy_data = {
|
|
|
18 |
"content": ["This is a sample post content. Just for demonstration purposes!"],
|
19 |
"poster": ["sampleUser123"],
|
20 |
"date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
|
@@ -22,7 +23,8 @@ dummy_data = {
|
|
22 |
"title": ["Sample Post Title: How to Use Hugging Face?"],
|
23 |
"score": [457],
|
24 |
"permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
|
25 |
-
"
|
|
|
26 |
}
|
27 |
|
28 |
|
|
|
15 |
|
16 |
# Dummy row for when we create a new repo
|
17 |
dummy_data = {
|
18 |
+
"id": ['id'],
|
19 |
"content": ["This is a sample post content. Just for demonstration purposes!"],
|
20 |
"poster": ["sampleUser123"],
|
21 |
"date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
|
|
|
23 |
"title": ["Sample Post Title: How to Use Hugging Face?"],
|
24 |
"score": [457],
|
25 |
"permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
|
26 |
+
"updated": False,
|
27 |
+
"new": False,
|
28 |
}
|
29 |
|
30 |
|