derek-thomas HF staff committed on
Commit ba7deb1
1 Parent(s): 3d12d3a

Adding nomic

Files changed (2)
  1. app.py +8 -3
  2. src/build_nomic.py +38 -0
app.py CHANGED
@@ -4,9 +4,10 @@ from pathlib import Path
 import gradio as gr
 from huggingface_hub import WebhookPayload, WebhooksServer
 
-from src.utilities import load_datasets, merge_and_update_datasets
 from src.my_logger import setup_logger
+from src.utilities import load_datasets, merge_and_update_datasets
 from src.visualize_logs import log_file_to_html_string
+from src.build_nomic import build_nomic
 
 proj_dir = Path(__name__).parent
 
@@ -14,7 +15,7 @@ logger = setup_logger(__name__)
 
 SUBREDDIT = os.environ["SUBREDDIT"]
 USERNAME = os.environ["USERNAME"]
-OG_DATASET= f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
+OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
 PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
 HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
 WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
@@ -50,10 +51,14 @@ async def community(payload: WebhookPayload):
     dataset = merge_and_update_datasets(dataset, original_dataset)
 
     # Push the augmented dataset to the Hugging Face hub
-    logger.debug(f"Pushing processed data to the Hugging Face Hub...")
+    logger.info(f"Pushing processed data to the Hugging Face Hub...")
     dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
     logger.info(f"Pushed processed data to the Hugging Face Hub")
 
+    logger.info(f"Building Nomic...")
+    build_nomic(dataset=dataset)
+    logger.info(f"Built Nomic")
+
 if __name__ == '__main__':
     app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
     # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)
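
Note: the `community` webhook handler patched above is registered on a `huggingface_hub` WebhooksServer; that wiring sits outside this hunk. A minimal sketch of how such an endpoint is typically set up, assuming a placeholder route name and an empty Gradio UI (neither is taken from this repo):

import os

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

# Assumed skeleton only; the real app.py builds its UI, logging, and secret elsewhere.
ui = gr.Blocks()
app = WebhooksServer(ui=ui, webhook_secret=os.getenv("HF_WEBHOOK_SECRET", "secret"))

@app.add_webhook("/dataset_repo")  # hypothetical route name
async def community(payload: WebhookPayload):
    # React only to content updates on the watched dataset repo,
    # then run the load -> merge -> push -> build_nomic steps shown above.
    if payload.event.scope.startswith("repo.content") and payload.event.action == "update":
        ...  # processing pipeline goes here
    return {"status": "ok"}

if __name__ == '__main__':
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
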
src/build_nomic.py ADDED
@@ -0,0 +1,38 @@
+# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
+import os
+import pandas as pd
+
+import nomic
+from nomic import atlas
+import numpy as np
+
+NOMIC_KEY = os.getenv('NOMIC_KEY')
+nomic.login(NOMIC_KEY)
+
+
+def build_nomic(dataset):
+    df = dataset['train'].to_pandas()
+
+    non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'content_length',
+                             'score', 'percentile_ranges']
+
+    # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
+    percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
+
+    # Ensure the bins are unique and include the maximum score
+    bins = sorted(set(percentiles + [df['score'].max()]))
+
+    # Define the labels for the percentile ranges
+    # The number of labels should be one less than the number of bins
+    labels = [int(i * 10) for i in range(len(bins) - 1)]
+
+    # Add a 'percentile_ranges' column to the DataFrame
+    # This assigns each score to its corresponding percentile range
+    df['percentile_ranges'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
+
+    # Create Atlas project
+    project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
+                             data=df[non_embedding_columns].to_dict(orient='records'),
+                             id_field='id',
+                             identifier='BORU Subreddit Neural Search',
+                             )
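
The score binning in build_nomic is the only non-obvious step: each post's `score` is mapped to the decile it falls in (0, 10, ..., 90), and that label rides along as metadata in the Atlas map. A standalone illustration of just that step, with made-up scores:

import pandas as pd

# Toy stand-in for the real 'score' column; the values are invented.
df = pd.DataFrame({'score': [1, 5, 12, 30, 75, 120, 400, 950, 2400, 8800, 15000]})

percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
bins = sorted(set(percentiles + [df['score'].max()]))
labels = [int(i * 10) for i in range(len(bins) - 1)]

df['percentile_ranges'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
print(df)  # each row now carries the decile (0-90) its score falls into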