|
import os |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
from huggingface_hub import WebhookPayload, WebhooksServer |
|
|
|
from src.my_logger import setup_logger |
|
from src.utilities import load_datasets, merge_and_update_datasets |
|
from src.visualize_logs import log_file_to_html_string |
|
from src.build_nomic import build_nomic |
|
|
|
# Directory that contains this source file.
# ``__file__`` is used instead of ``__name__``: ``__name__`` is the module's
# import name (e.g. "__main__"), not a filesystem path, so building a Path
# from it always produced "." regardless of where the file actually lives.
proj_dir = Path(__file__).resolve().parent
|
|
|
# Module-level logger produced by the project's logging helper.
logger = setup_logger(__name__)

# Required settings are read directly from the environment; a missing
# variable raises KeyError while this module is being imported.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]

# Repository id derived from the user name and subreddit read above.
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]

# Optional setting: falls back to the literal "secret" when the variable
# HF_WEBHOOK_SECRET is not defined.
# NOTE(review): the fallback is a guessable value; consider requiring the
# variable to be set in deployed environments.
WEBHOOK_SECRET = os.environ.get("HF_WEBHOOK_SECRET", "secret")
|
|
|
# Markdown text rendered with gr.Markdown in the "Application" tab.
# The dataset links in this text are hard-coded in the literal; they are not
# derived from the OG_DATASET / PROCESSED_DATASET constants defined above.
intro_md = """ |

# Processing BORU |

This space is triggered by a webhook for changes on |

[derek-thomas/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-bestofredditorupdates). |

It then takes the updates from that dataset and get embeddings and puts the results in |

[https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed) |

""" |
|
|
|
# Static HTML document passed to gr.HTML in the "Application" tab. It styles
# a centered container and embeds a Nomic Atlas map through an <iframe> whose
# URL is hard-coded in the literal.
# NOTE(review): the <title> reads "conll2003", which looks unrelated to the
# rest of this page - confirm whether it is a leftover from another project.
html_str = """ |

<html> |



<head> |

<title>conll2003</title> |

<style> |

body { |

font-family: Arial, sans-serif; |

background-color: #f0f0f0; |

display: flex; |

justify-content: center; |

align-items: center; |

height: 100vh; |

margin: 0; |

padding: 0; |

color: #333; |

} |

.iframe-container { |

border: 1px solid #ccc; |

border-radius: 10px; |

overflow: hidden; |

width: 80%; |

height: 80%; |

box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); |

} |

iframe { |

width: 100%; |

height: 100%; |

border: none; |

} |

</style> |

</head> |



<body> |

<div class="iframe-container"> |

<iframe src="https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map/cdd8c890-2fac-4ea6-91f8-e6821203cfcb" allow="clipboard-read; clipboard-write" |

title="Nomic Atlas"></iframe> |

</div> |

</body> |



</html>""" |
|
|
|
# Two-tab Gradio interface:
#   * "Application" shows the intro markdown followed by the embedded HTML.
#   * "Logs" shows a heading and an HTML component fed by
#     log_file_to_html_string.
with gr.Blocks() as ui:
    with gr.Tab(label="Application"):
        gr.Markdown(value=intro_md)
        gr.HTML(value=html_str)

    with gr.Tab(label="Logs"):
        gr.Markdown(value="# Logs")
        # The function object itself (not its result) is handed to the
        # component together with every=1, so Gradio is responsible for
        # calling it again on that interval.
        output = gr.HTML(value=log_file_to_html_string, every=1)

# Webhook server that wraps the queued UI and checks incoming requests
# against WEBHOOK_SECRET.
app = WebhooksServer(
    ui=ui.queue(),
    webhook_secret=WEBHOOK_SECRET,
)
|
|
|
|
|
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Refresh the processed dataset and the Nomic map after a repo event.

    Registered on the ``/dataset_repo`` webhook route. Payloads whose event
    scope does not start with ``"repo"`` are ignored. For all other payloads
    the handler runs these steps in order, logging before and after each:

    1. load the datasets via ``load_datasets()``;
    2. combine them with ``merge_and_update_datasets``;
    3. push the result to ``PROCESSED_DATASET`` on the Hugging Face Hub,
       authenticating with ``HUGGINGFACE_AUTH_TOKEN``;
    4. rebuild the Nomic map with ``build_nomic``.

    Args:
        payload: The webhook payload delivered by the Hugging Face Hub.

    Returns:
        None in every case.
    """
    # Guard clause: anything that is not a repo-scoped event is dropped.
    if not payload.event.scope.startswith("repo"):
        return

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")

    logger.info("Loading new dataset...")
    ds, source_ds = load_datasets()
    logger.info("Loaded new dataset")

    logger.info("Merging and Updating row...")
    ds = merge_and_update_datasets(ds, source_ds)

    logger.info("Pushing processed data to the Hugging Face Hub...")
    ds.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
    logger.info("Pushed processed data to the Hugging Face Hub")

    logger.info("Building Nomic...")
    build_nomic(dataset=ds)
    logger.info("Built Nomic")
|
|
|
if __name__ == "__main__":
    # Start the webhook server only when this file is executed as a script:
    # bind to 0.0.0.0 on port 7860 with show_error enabled.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
|
|
|
|