File size: 3,770 Bytes
8e8a9fc 749d1d8 fceefe7 749d1d8 fceefe7 6a95e74 fceefe7 8e8a9fc fceefe7 f466394 8e8a9fc fceefe7 5d9e0b8 c1f39f8 fceefe7 749d1d8 5d9e0b8 fceefe7 8e8a9fc 24c9f40 8e8a9fc 749d1d8 5d9e0b8 3165ac3 5d9e0b8 9cd0b93 9a66c2f 5d9e0b8 075c34d f0e56b8 fceefe7 962f45f fceefe7 ec203a9 fceefe7 749d1d8 21acbc3 543cfd4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import os
from pathlib import Path
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer
from utilities.my_logger import setup_logger
from utilities.visualize_logs import log_file_to_html_string
proj_dir = Path(__name__).parent
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/reddit-{SUBREDDIT}"
FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ["daily", "hourly"]:
raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
logger = setup_logger(__name__)
intro_md = f"""
# Reddit Dataset Creator
This is a reddit dataset creator which builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})
which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.
As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""
how_to_md = f"""
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
- Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
- You need the `secret` and the `Client ID` from the reddit application.
- `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
"""
how_does_it_work_md = f"""
# Core Components
There are 2 core components [main](main.py) and [app](app.py).
Main does a few things:
- Pulls from a datasource
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)
App
- Visualizes the log file from Main
# Running it
This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
log files. I use gradio for `app` and map that to the open port of huggingface spaces.
The only communication between `app` and `main` is the log file.
"""
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
with gr.Blocks(js=js_func) as ui:
with gr.Tab("Application"):
gr.Markdown(intro_md)
gr.Image(str(proj_dir / 'media' / 'reddit_scraper.drawio.png'), type='filepath')
gr.Markdown("# Logs")
output = gr.HTML(log_file_to_html_string, every=1)
with gr.Tab("How to Create?"):
gr.Markdown(how_to_md)
with gr.Tab("How does it work?"):
gr.Markdown(how_does_it_work_md)
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
if payload.event.scope.startswith("repo"):
logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
if __name__ == '__main__':
app.launch(server_name="0.0.0.0", show_error=True, server_port=7860, share=False)
# ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860, share=False)
|