Spaces:
Commit ba7deb1 • 1 Parent(s): 3d12d3a
Adding nomic
Files changed:
- app.py +8 -3
- src/build_nomic.py +38 -0
app.py
CHANGED
@@ -4,9 +4,10 @@ from pathlib import Path
 import gradio as gr
 from huggingface_hub import WebhookPayload, WebhooksServer
 
-from src.utilities import load_datasets, merge_and_update_datasets
 from src.my_logger import setup_logger
+from src.utilities import load_datasets, merge_and_update_datasets
 from src.visualize_logs import log_file_to_html_string
+from src.build_nomic import build_nomic
 
 proj_dir = Path(__name__).parent
 
@@ -14,7 +15,7 @@ logger = setup_logger(__name__)
 
 SUBREDDIT = os.environ["SUBREDDIT"]
 USERNAME = os.environ["USERNAME"]
-OG_DATASET= f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
+OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
 PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
 HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
 WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
@@ -50,10 +51,14 @@ async def community(payload: WebhookPayload):
     dataset = merge_and_update_datasets(dataset, original_dataset)
 
     # Push the augmented dataset to the Hugging Face hub
-    logger.
+    logger.info(f"Pushing processed data to the Hugging Face Hub...")
     dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
     logger.info(f"Pushed processed data to the Hugging Face Hub")
 
+    logger.info(f"Building Nomic...")
+    build_nomic(dataset=dataset)
+    logger.info(f"Built Nomic")
+
 if __name__ == '__main__':
     app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
     # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)
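Context (not part of the commit): the hunks above only show the edited parts of app.py. Below is a minimal sketch of how a huggingface_hub WebhooksServer app of this shape is typically wired up, assuming a placeholder Gradio ui and an assumed route path; the handler body is condensed to comments and none of this is the Space's verbatim code.

import os

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", "secret")

# Placeholder UI; the real app.py builds its own Gradio Blocks
# (it imports a log viewer from src.visualize_logs).
with gr.Blocks() as ui:
    gr.Markdown("Dataset creator status")

app = WebhooksServer(ui=ui, webhook_secret=WEBHOOK_SECRET)

# The route path here is an assumption; the diff only shows the handler signature.
@app.add_webhook("/community")
async def community(payload: WebhookPayload):
    # In the Space this handler loads the datasets, merges them, pushes
    # PROCESSED_DATASET to the Hub, and (new in this commit) calls
    # build_nomic(dataset=dataset).
    return {"status": "accepted"}

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)

The commit's only behavioural change is the last step of the handler: after the processed dataset is pushed, build_nomic(dataset=dataset) is called to rebuild the Atlas map.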
src/build_nomic.py
ADDED
@@ -0,0 +1,38 @@
+# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
+import os
+import pandas as pd
+
+import nomic
+from nomic import atlas
+import numpy as np
+
+NOMIC_KEY = os.getenv('NOMIC_KEY')
+nomic.login(NOMIC_KEY)
+
+
+def build_nomic(dataset):
+    df = dataset['train'].to_pandas()
+
+    non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'content_length',
+                             'score', 'percentile_ranges']
+
+    # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
+    percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
+
+    # Ensure the bins are unique and include the maximum score
+    bins = sorted(set(percentiles + [df['score'].max()]))
+
+    # Define the labels for the percentile ranges
+    # The number of labels should be one less than the number of bins
+    labels = [int(i * 10) for i in range(len(bins) - 1)]
+
+    # Add a 'percentile_ranges' column to the DataFrame
+    # This assigns each score to its corresponding percentile range
+    df['percentile_ranges'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
+
+    # Create Atlas project
+    project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
+                             data=df[non_embedding_columns].to_dict(orient='records'),
+                             id_field='id',
+                             identifier='BORU Subreddit Neural Search',
+                             )
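Note (not part of the commit): the one non-obvious step in build_nomic is the decile bucketing of 'score'. A self-contained toy run of the same binning, using hypothetical scores rather than the subreddit data:

import pandas as pd

# Toy scores (hypothetical) pushed through the same quantile-based binning.
scores = pd.Series([1, 2, 5, 8, 13, 21, 34, 55, 89, 144, 233])
percentiles = scores.quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
bins = sorted(set(percentiles + [scores.max()]))       # unique edges, including the max
labels = [int(i * 10) for i in range(len(bins) - 1)]   # 0, 10, ..., one label per interval
deciles = pd.cut(scores, bins=bins, labels=labels, include_lowest=True)
print(deciles.tolist())  # each score mapped to the decile bucket it falls in

In app.py this function is called on the merged dataset right after it is pushed, so the 'train' split is expected to carry an 'embedding' column (stackable into a 2-D numpy array via np.stack) plus the metadata columns listed in non_embedding_columns; atlas.map_data then uploads those embeddings and records to Atlas under the 'BORU Subreddit Neural Search' identifier.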