# Gradio front end and Hugging Face webhook endpoint for the Reddit dataset creator space.
import os
from pathlib import Path

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

from utilities.my_logger import setup_logger
from utilities.visualize_logs import log_file_to_html_string

# Directory containing this file; used to locate bundled assets (e.g. the
# diagram under media/).  This must be derived from __file__: Path(__name__)
# is only the module *name* treated as a relative path, so its parent is
# always "." and asset lookups silently depended on the working directory.
proj_dir = Path(__file__).resolve().parent

# Required settings: a missing variable raises KeyError at import time.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/reddit-{SUBREDDIT}"

# Must be 'daily' or 'hourly' (case-insensitive); validated here so that a
# misconfiguration fails at start-up instead of later.
FREQUENCY = os.environ.get("FREQUENCY", "").lower()
if FREQUENCY not in ("daily", "hourly"):
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

# NOTE(review): when HF_WEBHOOK_SECRET is unset this falls back to the
# guessable literal 'secret'; set the variable in any real deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

logger = setup_logger(__name__)


# Markdown rendered at the top of the "Application" tab.  The dataset and
# subreddit links are filled in from DATASET_NAME and SUBREDDIT.
intro_md = f"""
# Reddit Dataset Creator
This is a reddit dataset creator which builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}) 
which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details. 

As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""

# Markdown for the "How to Create?" tab: setup steps for duplicating this
# space.  Plain string on purpose -- the text has no placeholders, so an
# f-string prefix would be misleading.
how_to_md = """
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
    - Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application.
    - `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
"""

# Markdown for the "How does it work?" tab: describes the main/app split and
# how the space is run.  Plain string on purpose -- the text has no
# placeholders, so an f-string prefix would be misleading.
how_does_it_work_md = """
# Core Components
There are 2 core components [main](main.py) and [app](app.py).
Main does a few things: 
- Pulls from a datasource 
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)

App
- Visualizes the log file from Main

# Running it
This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
log files. I use gradio for `app` and map that to the open port of huggingface spaces. 

The only communication between `app` and `main` is the log file.
"""

# JavaScript handed to gr.Blocks(js=...).  If the page URL's `__theme` query
# parameter is not 'dark', it sets the parameter to 'dark' and navigates to
# the updated URL, which forces the dark theme.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""


# Diagram image shipped under media/, displayed below the intro text.
_diagram_path = proj_dir.joinpath("media", "reddit_scraper.drawio.png")

# Tabs whose whole content is one Markdown document, in display order.
_markdown_tabs = (
    ("How to Create?", how_to_md),
    ("How does it work?", how_does_it_work_md),
)

with gr.Blocks(js=js_func) as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(str(_diagram_path), type="filepath")
        gr.Markdown("# Logs")
        # The callable (not its result) is passed together with `every`, so
        # the log view is recomputed periodically rather than rendered once.
        output = gr.HTML(log_file_to_html_string, every=1)

    for _tab_title, _tab_markdown in _markdown_tabs:
        with gr.Tab(_tab_title):
            gr.Markdown(_tab_markdown)

# Wrap the queued UI in a webhooks server so webhook routes can be registered
# on the same application.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Log incoming webhook events whose scope starts with ``"repo"``.

    Registered on the webhooks server under ``/dataset_repo``.  Events with
    any other scope are ignored; nothing is returned in either case.
    """
    event = payload.event
    if not event.scope.startswith("repo"):
        return

    repo_name = payload.repo.name
    action = event.action
    logger.info(f"Webhook received from {repo_name} indicating a repo {action}")


if __name__ == "__main__":
    # Listen on all interfaces, port 7860.  To serve only the UI without the
    # webhook routes, call ui.queue().launch(...) with these same options.
    _launch_kwargs = {
        "server_name": "0.0.0.0",
        "show_error": True,
        "server_port": 7860,
        "share": False,
    }
    app.launch(**_launch_kwargs)