Commit 5d9e0b8
Parent(s): 2870060

Major updates from sister repo

Changed files:
- app.py                         +66 -9
- main.py                        +38 -17
- my_logger.py                   +0 -22
- requirements.txt               +3 -1
- utilities/data_collator.py     +20 -11
- utilities/praw_downloader.py   +19 -3
- utilities/readme_update.py     +14 -5
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
 from pathlib import Path
 
 import gradio as gr
+from bs4 import BeautifulSoup
 from rich.console import Console
 from rich.syntax import Syntax
 
@@ -11,6 +12,10 @@ subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
 
+frequency = os.environ.get("FREQUENCY", '').lower()
+if frequency not in ["daily", "hourly"]:
+    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")
+
 
 def log_file_to_html_string():
     log_file = "mylog.log"
@@ -27,27 +32,75 @@ def log_file_to_html_string():
     output = "".join(lines)
     syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
 
-    console.print(syntax)
+    console.print(syntax);
     html_content = console.export_html(inline_styles=True)
 
-
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    # Modify the <pre> tag
+    pre_tag = soup.pre
+    pre_tag['class'] = 'scrollable'
+    del pre_tag['style']
+
+    # Add your custom styles and the .scrollable CSS to the <style> tag
+    style_tag = soup.style
+    style_content = """
     pre, code {
         background-color: #272822;
     }
-
-
-
+    .scrollable {
+        font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
+        height: 500px;
+        overflow: auto;
+    }
+    """
+    style_tag.append(style_content)
 
+    return soup.prettify()
 
-
-
-
+
+intro_md = f"""
+# Reddit Dataset Creator
+This is a reddit dataset creator which builds and updates [{dataset_name}](https://huggingface.co/datasets/{dataset_name})
+which pulls from [/r/{subreddit}](http://www.reddit.com/r/{subreddit}). Check the dataset for more details.
 
 As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
 """
 
+how_to_md = f"""
+# How to make your own space and dataset
+1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
+  - Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
+  - You need the `secret` and the `Client ID` from the reddit application.
+  - `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
+2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
+3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
+and fill in the information
+"""
+
+how_does_it_work_md = f"""
+# Core Components
+There are 2 core components [main](main.py) and [app](app.py).
+Main does a few things:
+- Pulls from a datasource
+- Updates a dataset on the hub
+- Updates the README of the dataset
+- Writes a local log file (inaccessible outside the spaces container)
+
+App
+- Visualizes the log file from Main
+
+# Running it
+This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
+log files. I use gradio for `app` and map that to the open port of huggingface spaces.
+
+The only communication between `app` and `main` is the log file.
+"""
+
 with gr.Blocks() as demo:
-    gr.
+    with gr.Tab("Application"):
+        gr.Markdown(intro_md)
         gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
         gr.Markdown("# Logs")
         output = gr.HTML(log_file_to_html_string, every=1)
@@ -58,6 +111,10 @@ with gr.Blocks() as demo:
         document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
     }
     """, )
+    with gr.Tab("How to Create?"):
+        gr.Markdown(how_to_md)
+    with gr.Tab("How does it work?"):
+        gr.Markdown(how_does_it_work_md)
 
 if __name__ == '__main__':
     demo.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)
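For reference, the new log-rendering path in `log_file_to_html_string` can be exercised on its own: Rich records the console output, exports it as HTML, and BeautifulSoup swaps the inline `<pre>` style for a scrollable class. A minimal sketch, assuming a local `sample.log` and placeholder CSS values rather than the Space's exact styles:

```python
from bs4 import BeautifulSoup
from rich.console import Console
from rich.syntax import Syntax


def render_log_as_html(log_path: str = "sample.log", tail_lines: int = 100) -> str:
    """Render the tail of a log file as syntax-highlighted, scrollable HTML."""
    with open(log_path, "rt") as f:
        lines = f.readlines()[-tail_lines:]

    # Record console output so it can be exported as HTML afterwards
    console = Console(record=True)
    console.print(Syntax("".join(lines), "python", theme="monokai", word_wrap=True))
    html_content = console.export_html(inline_styles=True)

    # Post-process with BeautifulSoup: make the <pre> block scrollable
    soup = BeautifulSoup(html_content, "lxml")
    pre_tag = soup.pre
    pre_tag["class"] = "scrollable"
    del pre_tag["style"]
    soup.style.append(".scrollable { height: 500px; overflow: auto; }")  # placeholder CSS
    return soup.prettify()
```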
main.py
CHANGED
@@ -1,14 +1,15 @@
 import os
 import time
-from datetime import datetime
+from datetime import datetime
 
 import pandas as pd
 import schedule
-from datasets import DatasetDict, load_dataset
+from datasets import Dataset, DatasetDict, load_dataset
 from huggingface_hub import login
 
-from utilities.data_collator import merge_and_filter_data
+from utilities.data_collator import get_latest_data, merge_and_filter_data
 from utilities.my_logger import setup_logger
+from utilities.praw_downloader import dummy_data
 from utilities.readme_update import update_readme
 
 # Set dataset name, path to README.md, and existing dataset details
@@ -17,6 +18,10 @@ username = os.environ["USERNAME"]
 dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
 dataset_readme_path = "README.md"
 
+frequency = os.environ.get("FREQUENCY", '').lower()
+if frequency not in ["daily", "hourly"]:
+    raise ValueError("FREQUENCY environment variable must be 'daily' or 'hourly'")
+
 # Authenticate with Hugging Face using an auth token
 auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
 login(auth_token, add_to_git_credential=True)
@@ -27,13 +32,21 @@ logger = setup_logger(__name__)
 def get_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
-        dataset = load_dataset(dataset_name
+        dataset = load_dataset(dataset_name)
         logger.debug("Loading existing dataset")
-        if "__index_level_0__" in dataset["train"].column_names:
-            dataset = dataset.remove_columns(["__index_level_0__"])
     except FileNotFoundError:
         logger.warning("Creating new dataset")
+
+        # Creating Initial Repo
         dataset = DatasetDict()
+        dataset['train'] = Dataset.from_dict(dummy_data)
+        dataset.push_to_hub(repo_id=dataset_name, token=auth_token)
+
+        # Pulling from Initial Repo
+        dataset = load_dataset(dataset_name)
+
+        # Remove dummy data
+        del dataset['train']
     return dataset
 
 
@@ -43,12 +56,17 @@ def main():
     dataset = get_dataset()
 
     # Get Latest Data and merge with historic data
-
-
-
+    new_df = get_latest_data()
+    if 'train' in dataset.keys():
+        old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
+        df = merge_and_filter_data(old_df=old_df, new_df=new_df)
+        new_rows = len(df) - len(old_df)
+    else:
+        df = new_df
+        new_rows = len(new_df)
+    dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
 
     # Update README
-    new_rows = len(new_df) - len(old_df)
     update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
     logger.info(f"Adding {new_rows} rows for {date}.")
 
@@ -58,14 +76,17 @@ def main():
     logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
 
 
-def schedule_daily_task():
+def schedule_periodic_task():
     """
-    Schedule the
+    Schedule the main task to run at the user-defined frequency
     """
-
-
-
-
+    if frequency == 'hourly':
+        logger.info(f'Scheduling tasks to run every hour at the top of the hour')
+        schedule.every().hour.at(":00").do(main)
+    elif frequency == 'daily':
+        start_time = '05:00'
+        logger.info(f'Scheduling tasks to run every day at: {start_time} UTC+00')
+        schedule.every().day.at(start_time).do(main)
 
     while True:
         schedule.run_pending()
@@ -73,4 +94,4 @@ def schedule_daily_task():
 
 
 if __name__ == "__main__":
-
+    schedule_periodic_task()
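The new `schedule_periodic_task` drives everything off the `schedule` library. A minimal sketch of the same pattern, with a stand-in `job` instead of `main()` and an added `time.sleep(1)` to avoid a busy loop (the truncated hunk does not show whether the real loop sleeps); note that `schedule` fires on the process's local clock, which is UTC in the Space container:

```python
import time

import schedule


def job() -> None:
    print("refreshing dataset...")  # stand-in for main()


def schedule_periodic_task(frequency: str) -> None:
    """Run `job` hourly at the top of the hour, or daily at 05:00 local time."""
    if frequency == "hourly":
        schedule.every().hour.at(":00").do(job)
    elif frequency == "daily":
        schedule.every().day.at("05:00").do(job)
    else:
        raise ValueError("frequency must be 'daily' or 'hourly'")

    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    schedule_periodic_task("hourly")
```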
my_logger.py
DELETED
@@ -1,22 +0,0 @@
-import logging
-
-
-def setup_logger(name: str):
-    logger = logging.getLogger(name)
-    logger.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
-    # Create a file handler to write logs to a file
-    file_handler = logging.FileHandler('mylog.log')
-    file_handler.setLevel(logging.DEBUG)
-    file_handler.setFormatter(formatter)
-    logger.addHandler(file_handler)
-
-    # Create a stream handler to write logs to the console
-    stream_handler = logging.StreamHandler()
-    stream_handler.setLevel(logging.DEBUG)
-    stream_handler.setFormatter(formatter)
-    logger.addHandler(stream_handler)
-
-    return logger
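The root-level `my_logger.py` is removed; callers keep importing `setup_logger` from `utilities.my_logger`, which is not shown in this diff. Assuming that module mirrors the deleted copy (file plus stream handlers writing to `mylog.log`), usage looks like this hypothetical snippet:

```python
from utilities.my_logger import setup_logger

# One logger per module; handlers write to the console and to mylog.log,
# the file that app.py tails and renders in the Gradio UI.
logger = setup_logger(__name__)
logger.info("Fetched new submissions")
```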
requirements.txt
CHANGED
@@ -6,4 +6,6 @@ requests==2.28.2
 loguru==0.7.0
 rich==13.3.4
 supervisor==4.2.5
-schedule==1.2.0
+schedule==1.2.0
+beautifulsoup4==4.12.2
+lxml==4.9.3
utilities/data_collator.py
CHANGED
@@ -29,7 +29,7 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
 
     # Find row with the longest content for each 'id'
     idx_longest_content = df.groupby('id')['content_length'].idxmax().values
-    df_longest_content = df.loc[idx_longest_content][
+    df_longest_content = df.loc[idx_longest_content].drop(columns=['score'])
 
     # Find row with the highest score for each 'id'
     idx_highest_score = df.groupby('id')['score'].idxmax().values
@@ -41,20 +41,28 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     return df_merged
 
 
-def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
+
+
+def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
     """
-    Merges
-
+    Merges two dataframes, sorts them by 'date_utc', and filters out redundant IDs.
+
+    The function first concatenates the old and new dataframes. Then, it sorts the
+    resulting dataframe by the 'date_utc' column. Finally, it filters out redundant IDs
+    using the `filter_redundant_ids` function.
 
     Args:
-    -
+    - old_df (pd.DataFrame): The original dataframe.
+    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
 
     Returns:
-    -
+    - pd.DataFrame: The merged, sorted, and filtered dataframe.
     """
-    latest_df = get_latest_data()
 
-
+    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
+    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
+
+    # Filter out redundant IDs
     df = filter_redundant_ids(df)
     return df
 
@@ -62,9 +70,10 @@ def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
 if __name__ == '__main__':
     # Mock data
     data = {
-        'id': [1, 1, 2, 2, 3
-        'content': ['short', 'longer content', '
-        'score': [10, 5,
+        'id': [1, 1, 2, 2, 3],
+        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
+        'score': [10, 5, 7, 9, 6],
+        'another_column': ['a', 'a', 'b', 'b', 'c']
     }
 
     df = pd.DataFrame(data)
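The hunks only show fragments of `filter_redundant_ids`, but the visible lines suggest it keeps, for each `id`, the row with the longest `content` and attaches the highest `score`. A hedged reconstruction under that assumption, runnable against the mock data from the diff (the function name and `content_length` handling here are illustrative, not the file's exact code):

```python
import pandas as pd


def filter_redundant_ids_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """For duplicate ids, keep the longest 'content' paired with the highest 'score'."""
    df = df.copy()
    df["content_length"] = df["content"].str.len()

    # Row with the longest content per id (drop its score so it can be replaced)
    idx_longest = df.groupby("id")["content_length"].idxmax().values
    longest = df.loc[idx_longest].drop(columns=["score"])

    # Highest score per id, merged back onto the longest-content rows
    idx_best_score = df.groupby("id")["score"].idxmax().values
    best_scores = df.loc[idx_best_score, ["id", "score"]]

    return longest.merge(best_scores, on="id").drop(columns=["content_length"])


if __name__ == "__main__":
    data = {
        "id": [1, 1, 2, 2, 3],
        "content": ["short", "much longer content", "mid", "size", "constant"],
        "score": [10, 5, 7, 9, 6],
        "another_column": ["a", "a", "b", "b", "c"],
    }
    print(filter_redundant_ids_sketch(pd.DataFrame(data)))
```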
utilities/praw_downloader.py
CHANGED
@@ -9,6 +9,22 @@ from utilities.my_logger import setup_logger
 # Setup logging
 logger = setup_logger(__name__)
 
+# Get subreddit
+subreddit_var = os.getenv("SUBREDDIT")
+reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
+
+# Dummy row for when we create a new repo
+dummy_data = {
+    "content": ["This is a sample post content. Just for demonstration purposes!"],
+    "poster": ["sampleUser123"],
+    "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
+    "flair": ["Discussion"],
+    "title": ["Sample Post Title: How to Use Hugging Face?"],
+    "score": [457],
+    "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
+    "id": ['id']
+}
+
 
 def get_reddit_instance() -> praw.Reddit:
     """Initialize and return a Reddit instance using PRAW."""
@@ -36,12 +52,12 @@ def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any
 def praw_downloader() -> List[Dict[str, str]]:
     """Main function to extract and save all submissions from the subreddit."""
     reddit = get_reddit_instance()
-    subreddit = reddit.subreddit(
+    subreddit = reddit.subreddit(subreddit_var)
 
-    logger.info('Starting to fetch submissions from
+    logger.info(f'Starting to fetch submissions from {os.getenv("SUBREDDIT")}.')
 
     submissions = []
-    for submission in subreddit.new(limit=
+    for submission in subreddit.new(limit=reddit_pull_limit):  # Set limit=None to get all posts
         # logger.debug(f'Processing post {submission.id} - {submission.title}')
         data = extract_submission_data(submission)
         submissions.append(data)
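For context, the downloader is plain PRAW: build a `praw.Reddit` client from environment variables and iterate `subreddit.new()`. A minimal sketch of that flow; the `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` variable names are assumptions (only `REDDIT_USER_AGENT` and `SUBREDDIT` appear in the diff), and only a few submission fields are extracted here:

```python
import os
from typing import Any, Dict, List

import praw


def get_reddit_instance() -> praw.Reddit:
    """Build a read-only Reddit client from environment variables."""
    return praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],          # assumed variable name
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],  # assumed variable name
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )


def fetch_new_submissions(limit: int = 10) -> List[Dict[str, Any]]:
    """Pull the newest submissions from the configured subreddit."""
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(os.environ["SUBREDDIT"])

    rows = []
    for submission in subreddit.new(limit=limit):  # limit=None fetches everything available
        rows.append(
            {
                "id": submission.id,
                "title": submission.title,
                "score": submission.score,
                "permalink": submission.permalink,
            }
        )
    return rows
```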
utilities/readme_update.py
CHANGED
@@ -1,9 +1,13 @@
 import os
+from datetime import datetime
 
+import pytz
 from datasets.download.download_config import DownloadConfig
 from datasets.utils.file_utils import cached_path
 from datasets.utils.hub import hf_hub_url
 
+frequency = os.environ.get("FREQUENCY", '').lower()
+
 
 def get_readme_path(dataset_name):
     readme_path = hf_hub_url(dataset_name, "README.md")
@@ -12,27 +16,32 @@ def get_readme_path(dataset_name):
 
 def update_readme(dataset_name, subreddit, latest_date, new_rows):
     path = get_readme_path(dataset_name=dataset_name)
+    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
+    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
+
     readme_text = f"""
 ## Dataset Overview
-The goal is to have an open dataset of
+The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. Im leveraging PRAW and the reddit API to get downloads.
 
-There is a limit of 1000 in an API call and limited search functionality, so this is run
+There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.
 
 ## Creation Details
-
+This dataset was created by [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-{subreddit})
 
 ## Update Frequency
-The dataset is updated
+The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.
 
 ## Licensing
 [Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
 > The Content created with or submitted to our Services by Users (“User Content”) is owned by Users and not by Reddit. Subject to your complete and ongoing compliance with the Data API Terms, Reddit grants you a non-exclusive, non-transferable, non-sublicensable, and revocable license to copy and display the User Content using the Data API solely as necessary to develop, deploy, distribute, and run your App to your App Users. You may not modify the User Content except to format it for such display. You will comply with any requirements or restrictions imposed on usage of User Content by their respective owners, which may include "all rights reserved" notices, Creative Commons licenses, or other terms and conditions that may be agreed upon between you and the owners. Except as expressly permitted by this section, no other rights or licenses are granted or implied, including any right to use User Content for other purposes, such as for training a machine learning or AI model, without the express permission of rightsholders in the applicable User Content
 
 My take is that you can't use this data for *training* without getting permission.
+
+## Opt-out
+To opt-out of this dataset please make a request in the community tab
 """
 
     append_readme(path=path, readme_text=readme_text)
-    return readme_text
 
 
 def append_readme(path, readme_text):