Spaces:
Sleeping
Sleeping
File size: 6,710 Bytes
c8c8070 0773644 1d82bda 22abe75 1d82bda 40eb9ab 4daf261 40eb9ab 1baaff6 40eb9ab 4daf261 40eb9ab 4daf261 40eb9ab 4daf261 1d82bda 1baaff6 ee7be58 a465593 ee7be58 1baaff6 4e3290f 1baaff6 1d82bda c8c8070 1d82bda 0251c9e 1d82bda 7d315d3 1d82bda 4e3290f 0773644 1d82bda 34ab564 0773644 4daf261 c8c8070 4daf261 56d3094 4e3290f 0773644 963e057 ce0d1e8 9de8daf 4e3290f 9de8daf 4e3290f 0506a5f 9de8daf 66ce76b 9de8daf 11b4e34 4e3290f a1d40d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import gradio as gr
import json
import re
import torch
import feedparser
import time
from urllib.parse import urlparse, parse_qs
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model.
model_name = 'yuntian-deng/ak-paper-selection-deberta'
# Token limit passed to the tokenizer; inputs are truncated to this length.
max_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: put the module in evaluation mode
if torch.cuda.is_available():
    model.cuda()

# Parallel lookup tables used by get_threshold_precision().  The file is read
# inside a context manager so the handle is closed right after parsing instead
# of being left for the garbage collector.
with open('validation_results.json', encoding='utf-8') as results_file:
    validation_results = json.load(results_file)
scores = validation_results['scores']
thresholds = validation_results['thresholds']
precisions = validation_results['precisions']
recalls = validation_results['recalls']
def get_threshold_precision(score_):
    """Map a model score to a (threshold, precision, recall) validation row.

    The module-level ``scores``, ``thresholds``, ``precisions`` and ``recalls``
    lists are walked in lockstep.  The scan stops at the first row whose score
    is greater than ``score_`` and the row before it is used.
    NOTE(review): this presumes the lists are sorted by ascending score --
    confirm against how validation_results.json is generated.

    Args:
        score_: Score produced by the model for the paper being evaluated.

    Returns:
        Tuple ``(threshold, precision, recall)`` taken from the selected row.
        When the row's threshold equals the row's score, ``score_`` itself is
        returned as the threshold.

    Raises:
        ValueError: If the validation tables contain no rows.
    """
    selected = None
    for row in zip(scores, thresholds, precisions, recalls):
        if score_ < row[0]:
            if selected is None:
                # score_ lies below every validation score.  Previously this
                # left the result unbound and crashed; clamp to the first row.
                selected = row
            break
        selected = row

    if selected is None:
        raise ValueError("validation results are empty; cannot look up a threshold")

    prev_score, prev_threshold, prev_precision, prev_recall = selected
    if prev_threshold == prev_score:
        prev_threshold = score_
    return prev_threshold, prev_precision, prev_recall
def extract_arxiv_id(input_text):
    """Pull an arXiv identifier out of a URL, or pass a bare identifier through.

    Recognised URL shapes (any text containing ``arxiv.org``):
      * a query string with ``id_list=<id>`` -- the first value is returned;
      * a path ending in ``abs/<id>`` or ``pdf/<id>[.pdf]``.

    Args:
        input_text: URL or identifier typed by the user.

    Returns:
        The extracted identifier, or the whitespace-stripped input when no
        identifier can be located.
    """
    text = input_text.strip()
    if 'arxiv.org' not in text:
        return text

    parsed_url = urlparse(text)
    query = parse_qs(parsed_url.query)
    if 'id_list' in query:
        return query['id_list'][0]

    # Dropping empty segments tolerates leading, trailing and doubled slashes,
    # and the length check avoids an IndexError on paths such as "" or
    # "arxiv.org" that have fewer than two segments.
    path_parts = [part for part in parsed_url.path.split('/') if part]
    if len(path_parts) >= 2 and path_parts[-2] in ('abs', 'pdf'):
        paper_id = path_parts[-1]
        if paper_id.endswith('.pdf'):
            paper_id = paper_id[:-len('.pdf')]
        return paper_id
    return text
def fetch_arxiv_data(arxiv_id):
    """Fetch title, authors and abstract for one paper from the arXiv export API.

    Args:
        arxiv_id: arXiv identifier to look up.

    Returns:
        Tuple ``(title, authors, abstract)`` built from the first feed entry,
        with author names joined by ``", "``.  ``("", "", "")`` when the feed
        contains no entries.
    """
    from urllib.parse import quote  # local import: only needed by this function

    time.sleep(3)  # Comply with arXiv API terms of usage
    # The identifier comes from user input; percent-encode it so characters
    # such as '&', '#' or spaces cannot change the shape of the request.
    query_url = f'http://export.arxiv.org/api/query?id_list={quote(arxiv_id.strip())}'
    response = feedparser.parse(query_url)
    if not response.entries:
        return "", "", ""

    entry = response.entries[0]
    authors = ', '.join(author.name for author in entry.authors)
    return entry.title, authors, entry.summary
def update_fields(url_or_id):
    """Autofill the paper fields from an arXiv URL/ID and score the paper.

    Args:
        url_or_id: arXiv URL or identifier entered by the user.

    Returns:
        Tuple ``(title, authors, abstract, output)`` where ``output`` is the
        text produced by ``predict``.  Four empty strings are returned when the
        identifier does not contain exactly one ``.`` or when arXiv returns
        nothing for it.
    """
    blank = ('', '', '', '')

    # Strip once so the value that is validated is the value that is fetched.
    arxiv_id = extract_arxiv_id(url_or_id).strip()
    if len(arxiv_id.split('.')) != 2:
        return blank
    print(arxiv_id)

    title, authors, abstract = fetch_arxiv_data(arxiv_id)
    if not any((title, authors, abstract)):
        # Nothing was found; do not run the model on an empty paper.
        return blank

    output = predict(title, authors, abstract)
    return title, authors, abstract, output
def normalize_spaces(text):
    """Collapse every whitespace run in *text* to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def fill_template(title, authors, abstract):
    """Render title, authors and abstract as the three-line text given to the model.

    Title and abstract have newlines replaced and whitespace normalised via
    ``normalize_spaces``; the comma-separated author list is re-joined with
    ``", "`` after trimming each name.
    """
    clean_title = normalize_spaces(title.replace('\n', ' '))
    clean_authors = ', '.join(map(str.strip, authors.split(',')))
    clean_abstract = normalize_spaces(abstract.replace('\n', ' '))
    lines = (
        f"Title: {clean_title}",
        f"Authors: {clean_authors}",
        f"Abstract: {clean_abstract}",
    )
    return "\n".join(lines)
@torch.no_grad()
def model_inference(title, authors, abstract):
    """Run the classifier on one paper and return the probability at class index 1.

    The paper is rendered with ``fill_template``, wrapped in literal
    ``[CLS]``/``[SEP]`` markers, tokenised with truncation to ``max_length``
    and fed to the module-level ``model``.
    NOTE(review): the markers are plain text added on top of whatever the
    tokenizer inserts itself -- presumably this mirrors training; confirm
    before changing.  Index 1 is presumably the "selected" class.
    """
    prompt = f'[CLS] {fill_template(title, authors, abstract)} [SEP]'
    print (prompt)
    encoded = tokenizer([prompt], return_tensors="pt", truncation=True, max_length=max_length)
    if torch.cuda.is_available():
        # Move every input tensor to the GPU.
        encoded = {name: tensor.cuda() for name, tensor in encoded.items()}
    probabilities = model(**encoded).logits.softmax(dim=-1).view(-1)
    return probabilities[1].item()
def predict(title, authors, abstract):
    """Score a paper and describe the score against the validation statistics.

    Returns a three-line message: the model score, the precision at the
    matching threshold, and the recall at that threshold, as looked up by
    ``get_threshold_precision``.
    """
    score = model_inference(title, authors, abstract)
    threshold, precision, recall = get_threshold_precision(score)
    return (
        f"Your score: {score:.2f}.\n"
        f"For papers with score>={threshold:.2f}, {precision * 100:.2f}% are selected by AK.\n"
        f"For papers selected by AK, {recall * 100:.2f}% have score>={threshold:.2f}"
    )
# Example paper used as the initial value of the title, authors and abstract
# input boxes in the UI below.
example_title = "WildChat: 1M ChatGPT Interaction Logs in the Wild"
example_authors = "Wenting Zhao, Xiang Ren, Jack Hessel, Claire Cardie, Yejin Choi, Yuntian Deng"
example_abstract = "Chatbots such as GPT-4 and ChatGPT are now serving millions of users. Despite their widespread use, there remains a lack of public datasets showcasing how these tools are used by a population of users in practice. To bridge this gap, we offered free access to ChatGPT for online users in exchange for their affirmative, consensual opt-in to anonymously collect their chat transcripts and request headers. From this, we compiled WildChat, a corpus of 1 million user-ChatGPT conversations, which consists of over 2.5 million interaction turns. We compare WildChat with other popular user-chatbot interaction datasets, and find that our dataset offers the most diverse user prompts, contains the largest number of languages, and presents the richest variety of potentially toxic use-cases for researchers to study. In addition to timestamped chat transcripts, we enrich the dataset with demographic data, including state, country, and hashed IP addresses, alongside request headers. This augmentation allows for more detailed analysis of user behaviors across different geographical regions and temporal dimensions. Finally, because it captures a broad range of use cases, we demonstrate the dataset’s potential utility in fine-tuning instruction-following models. WildChat is released at https://wildchat.allen.ai under AI2 ImpACT Licenses."
# Assemble the page.  Components are created in the same order as before,
# since creation order inside the Blocks context determines layout.
with gr.Blocks() as demo:
    # Manual-entry fields, pre-filled with the example paper.
    title_box = gr.Textbox(
        label="Paper Title",
        placeholder="Enter paper title",
        value=example_title,
    )
    author_box = gr.Textbox(
        label="Authors (separated by comma)",
        placeholder="Enter authors (separated by comma)",
        value=example_authors,
    )
    abstract_box = gr.TextArea(
        label="Abstract",
        placeholder="Enter abstract",
        value=example_abstract,
    )
    output_box = gr.Textbox(label="Predicted Selection Probability")

    # Main form: sends the three fields to predict() and shows its message.
    iface = gr.Interface(
        fn=predict,
        inputs=[title_box, author_box, abstract_box],
        outputs=[output_box],
        # The clear button is supplied but created with visible=False.
        clear_btn=gr.Button("Clear", variant="secondary", visible=False),
        title="Paper Selection Prediction",
        description="Predict if @_akhaliq will select your paper into Hugging Face papers. Enter the title, authors, and abstract of your paper, or enter an arXiv URL/ID.",
        live=False,
        concurrency_limit=1,
    )

    # Optional shortcut: fetch the fields from arXiv and predict in one click.
    arxiv_box = gr.Textbox(
        label="[Optional] Autofill using arXiv URL/ID",
        placeholder="[Optional] Autofill using arXiv URL/ID",
    )
    autofill_btn = gr.Button("Predict using arXiv URL/ID", variant="secondary")
    autofill_btn.click(
        update_fields,
        inputs=[arxiv_box],
        outputs=[title_box, author_box, abstract_box, output_box],
        concurrency_limit=1,
    )

demo.queue(max_size=20).launch()
|