# Hugging Face Space (runs on CPU Upgrade hardware) — scraped page header removed.
import os | |
import re | |
import copy | |
import datasets | |
import pandas as pd | |
import gradio as gr | |
from collections import defaultdict | |
from datetime import datetime, timedelta | |
from datasets import Dataset | |
from huggingface_hub import HfApi | |
from huggingface_hub import create_repo | |
from huggingface_hub.utils import HfHubHTTPError | |
import utils | |
from paper.download import ( | |
download_pdf_from_arxiv, | |
get_papers_from_hf_daily_papers, | |
get_papers_from_arxiv_ids | |
) | |
from paper.parser import extract_text_and_figures | |
from gen.gemini import get_basic_qa, get_deep_qa | |
from constants.styles import STYLE | |
from constants.js import UPDATE_SEARCH_RESULTS, UPDATE_IF_TYPE | |
from apscheduler.schedulers.background import BackgroundScheduler | |
def count_nans(row):
    """Count how many fields of a paper record are missing (None).

    Used to decide which of two duplicate records for the same paper
    title is more complete: the one with fewer None values wins.

    Args:
        row: dict-like mapping of field name -> value.

    Returns:
        int: number of fields whose value is None.
    """
    # BUG FIX: the original iterated over the module-level `data` variable
    # instead of the `row` parameter, so every call counted the NaNs of
    # whatever `data` last held rather than the record passed in.
    return sum(1 for v in row.values() if v is None)
# Secrets / configuration pulled from the environment (configured in the
# Space settings).
gemini_api_key = os.getenv("GEMINI_API_KEY")
hf_token = os.getenv("HF_TOKEN")
# Dataset repo holding the generated paper Q&As.
dataset_repo_id = os.getenv("SOURCE_DATA_REPO_ID") # "chansung/auto-paper-qa2"
# Dataset repo holding user-requested arXiv IDs awaiting processing.
request_arxiv_repo_id = os.getenv("REQUEST_DATA_REPO_ID") # "chansung/requested-arxiv-ids-3"

# Loaded once at import time; the Space is restarted by process_arxiv_ids()
# after new Q&As are published so these reload.
ds = datasets.load_dataset(dataset_repo_id)
request_ds = datasets.load_dataset(request_arxiv_repo_id)

# Flatten each queue row's list of requested IDs into one flat list, then
# into the DataFrame that seeds the request-queue table in the UI.
requested_arxiv_ids = []
for request_d in request_ds['train']:
    arxiv_ids = request_d['Requested arXiv IDs']
    requested_arxiv_ids = requested_arxiv_ids + arxiv_ids

requested_arxiv_ids_df = pd.DataFrame({'Requested arXiv IDs': requested_arxiv_ids})
title2qna = {}   # paper title -> its (most complete) Q&A record
date2qna = {}    # "YYYY-MM-DD" -> list of Q&A records published that day
# year -> month -> day -> list of paper records (drives the date dropdowns)
date_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# De-duplicate per (date, title): when a paper appears more than once on a
# day, keep the record with fewer missing (None) fields.
for data in ds["train"]:
    date = data["target_date"].strftime("%Y-%m-%d")

    if date in date2qna:
        # Iterate a deep copy so removing from date2qna[date] is safe.
        papers = copy.deepcopy(date2qna[date])
        for paper in papers:
            if paper["title"] == data["title"]:
                # Existing record is less complete -> drop it; the new
                # record is appended below.
                if count_nans(paper) > count_nans(data):
                    date2qna[date].remove(paper)
        date2qna[date].append(data)
        del papers
    else:
        date2qna[date] = [data]

# Build the title index and the year/month/day hierarchy from the
# de-duplicated per-date lists.
for date in date2qna:
    year, month, day = date.split("-")
    papers = date2qna[date]
    for paper in papers:
        title2qna[paper["title"]] = paper
        date_dict[year][month][day].append(paper)

titles = title2qna.keys()

sorted_dates = sorted(date2qna.keys())

# Pick the most recent year/month/day that has papers; its first paper is
# the default selection shown when the app loads.
sorted_year = sorted(date_dict.keys())
last_year = sorted_year[-1]
sorted_month = sorted(date_dict[last_year].keys())
last_month = sorted_month[-1]
sorted_day = sorted(date_dict[last_year][last_month].keys())
last_day = sorted_day[-1]

last_papers = date_dict[last_year][last_month][last_day]
selected_paper = last_papers[0]
def filter_function(example, ids):
    """Remove already-processed arXiv IDs from one request-queue row.

    Intended for `Dataset.map`: strips every ID in `ids` from the row's
    'Requested arXiv IDs' list. Rows whose list becomes empty are dropped
    by a subsequent `Dataset.filter` (see process_arxiv_ids).

    Args:
        example: dataset row with a 'Requested arXiv IDs' list.
        ids: iterable of arXiv ID strings to remove.

    Returns:
        The row with matching IDs removed (mutated in place).
    """
    ids_e = example['Requested arXiv IDs']
    for iid in ids:
        if iid in ids_e:
            ids_e.remove(iid)

    example['Requested arXiv IDs'] = ids_e
    # Removed leftover debug `print(example)` that spammed the job logs.
    return example
def process_arxiv_ids(gemini_api, hf_repo_id, req_hf_repo_id, hf_token, how_many=10):
    """Background job: generate Q&As for queued arXiv IDs and publish them.

    Reads up to `how_many` requested IDs from the request-queue dataset,
    and for each paper: downloads the PDF, extracts text, generates seed
    and follow-up Q&As with Gemini, pushes the result to `hf_repo_id`,
    and removes the ID from the request dataset. Finally restarts the
    Space so the module-level data loading picks up the new records.

    Args:
        gemini_api: Gemini API key.
        hf_repo_id: dataset repo that receives the generated Q&As.
        req_hf_repo_id: dataset repo holding the request queue.
        hf_token: HF token with write access to both repos.
        how_many: maximum number of IDs to process per run.
    """
    arxiv_ids = []
    ds1 = datasets.load_dataset(req_hf_repo_id)
    for d in ds1['train']:
        req_arxiv_ids = d['Requested arXiv IDs']
        # NOTE(review): rows whose first entry is "top" are skipped —
        # presumably a placeholder/header row; confirm against the dataset.
        if len(req_arxiv_ids) > 0 and req_arxiv_ids[0] != "top":
            arxiv_ids = arxiv_ids + req_arxiv_ids

    # Cap the batch size for this run.
    arxiv_ids = arxiv_ids[:how_many]

    # (`arxiv_ids` is never None here; the truthiness check on length is
    # what matters.)
    if arxiv_ids is not None and len(arxiv_ids) > 0:
        print(f"1. Get metadata for the papers [{arxiv_ids}]")
        papers = get_papers_from_arxiv_ids(arxiv_ids)
        print("...DONE")

        print("2. Generating QAs for the paper")
        for paper in papers:
            try:
                title = paper['title']
                target_date = paper['target_date']
                abstract = paper['paper']['summary']
                arxiv_id = paper['paper']['id']
                authors = paper['paper']['authors']

                print(f"...PROCESSING ON[{arxiv_id}, {title}]")
                print(f"......Downloading the paper PDF")
                filename = download_pdf_from_arxiv(arxiv_id)
                print(f"......DONE")

                print(f"......Extracting text and figures")
                texts, figures = extract_text_and_figures(filename)
                text =' '.join(texts)
                print(f"......DONE")

                print(f"......Generating the seed(basic) QAs")
                # "trucate" (sic) matches the keyword expected by gen.gemini.
                qnas = get_basic_qa(text, gemini_api_key=gemini_api, trucate=30000)
                qnas['title'] = title
                qnas['abstract'] = abstract
                qnas['authors'] = ','.join(authors)
                qnas['arxiv_id'] = arxiv_id
                qnas['target_date'] = target_date
                qnas['full_text'] = text
                print(f"......DONE")

                print(f"......Generating the follow-up QAs")
                qnas = get_deep_qa(text, qnas, gemini_api_key=gemini_api, trucate=30000)
                # Drop the intermediate nested Q&A structure before export.
                del qnas["qna"]
                print(f"......DONE")

                print(f"......Exporting to HF Dataset repo at [{hf_repo_id}]")
                utils.push_to_hf_hub(qnas, hf_repo_id, hf_token)
                print(f"......DONE")

                print(f"......Updating request arXiv HF Dataset repo at [{req_hf_repo_id}]")
                # Strip the processed ID from every queue row, then drop
                # rows left empty.
                # NOTE(review): this rebinds ds1 to a Dataset, so
                # `ds1['train']` would misbehave if re-entered — verify.
                ds1 = ds1['train'].map(
                    lambda example: filter_function(example, [arxiv_id])
                ).filter(
                    lambda example: len(example['Requested arXiv IDs']) > 0
                )
                ds1.push_to_hub(req_hf_repo_id, token=hf_token)
                print(f"......DONE")
            except Exception as e:
                # Best-effort: skip any paper that fails at any step.
                print(f".......failed due to exception {e}")
                continue

    # Restart the Space so the module-level dataset loading re-runs and the
    # UI reflects the newly generated Q&As.
    HfApi(token=hf_token).restart_space(
        repo_id="chansung/paper_qa", token=hf_token
    )
def push_to_hf_hub(
    df, repo_id, token, append=True
):
    """Push a pandas DataFrame to a HF Dataset repo, optionally appending.

    Args:
        df: DataFrame to upload.
        repo_id: target dataset repo id (e.g. "user/repo").
        token: HF access token with write permission.
        append: when True and the repo already exists, concatenate the new
            rows after the existing 'train' split instead of replacing it.
    """
    exist = False
    ds = Dataset.from_pandas(df)

    try:
        # BUG FIX: the original called create_repo with the module-level
        # `request_arxiv_repo_id` and `hf_token`, silently ignoring the
        # `repo_id` / `token` parameters.
        create_repo(repo_id, repo_type="dataset", token=token)
    except HfHubHTTPError:
        # Repo already exists (HTTP 409) — append below if requested.
        exist = True

    if exist and append:
        existing_ds = datasets.load_dataset(repo_id)
        ds = datasets.concatenate_datasets([existing_ds['train'], ds])

    ds.push_to_hub(repo_id, token=token)
def _filter_duplicate_arxiv_ids(arxiv_ids_to_be_added):
    """Drop IDs that are already queued or already processed.

    Checks both the request-queue dataset and the generated-QA dataset and
    returns only the IDs that appear in neither.

    Args:
        arxiv_ids_to_be_added: list of candidate arXiv ID strings.

    Returns:
        list of genuinely new IDs (order not preserved).
    """
    # CONSISTENCY FIX: use the configured repo ids instead of hard-coded
    # repo names, so SOURCE_DATA_REPO_ID / REQUEST_DATA_REPO_ID overrides
    # are honored (the literals duplicated the documented defaults).
    ds1 = datasets.load_dataset(request_arxiv_repo_id)
    ds2 = datasets.load_dataset(dataset_repo_id)

    unique_arxiv_ids = set()

    for d in ds1['train']:
        # Each queue row holds a list of requested IDs; update() avoids the
        # original's per-row set rebuild via list concatenation.
        unique_arxiv_ids.update(d['Requested arXiv IDs'])

    for d in ds2['train']:
        unique_arxiv_ids.add(d['arxiv_id'])

    return list(set(arxiv_ids_to_be_added) - unique_arxiv_ids)
def _is_arxiv_id_valid(arxiv_id): | |
pattern = r"^\d{4}\.\d{5}$" | |
return bool(re.match(pattern, arxiv_id)) | |
def _get_valid_arxiv_ids(arxiv_ids_str):
    """Split a comma-separated string into valid and invalid arXiv IDs.

    Args:
        arxiv_ids_str: raw user input such as "2401.12345, 2402.00001".

    Returns:
        (valid, invalid): two lists of stripped ID strings, preserving
        the input order within each list.
    """
    candidates = [token.strip() for token in arxiv_ids_str.split(",")]
    valid = [c for c in candidates if _is_arxiv_id_valid(c)]
    invalid = [c for c in candidates if not _is_arxiv_id_valid(c)]
    return valid, invalid
def add_arxiv_ids_to_queue(queue, arxiv_ids_str):
    """Handle submission of the "request arXiv IDs" textbox.

    Validates the comma-separated input, filters out IDs already queued or
    processed, appends the remainder to the on-screen queue table, and
    pushes them to the request dataset repo.

    Args:
        queue: pandas DataFrame currently backing the queue table.
        arxiv_ids_str: raw textbox input.

    Returns:
        (updated queue DataFrame, cleared textbox component).
    """
    # Removed leftover debug prints (`print(0)` / `print("01")`).
    valid_arxiv_ids, invalid_arxiv_ids = _get_valid_arxiv_ids(arxiv_ids_str)

    if len(invalid_arxiv_ids) > 0:
        gr.Warning(f"found invalid arXiv ids as in {invalid_arxiv_ids}")

    if len(valid_arxiv_ids) > 0:
        valid_arxiv_ids = _filter_duplicate_arxiv_ids(valid_arxiv_ids)

        if len(valid_arxiv_ids) > 0:
            # One-element lists: each queue row stores a list of IDs.
            valid_arxiv_ids = [[arxiv_id] for arxiv_id in valid_arxiv_ids]
            gr.Warning(f"Processing on [{valid_arxiv_ids}]. Other requested arXiv IDs not found on this list should be already processed or being processed...")

            valid_arxiv_ids = pd.DataFrame({'Requested arXiv IDs': valid_arxiv_ids})
            queue = pd.concat([queue, valid_arxiv_ids])
            # BUG FIX: reset_index returns a new frame; the original call
            # discarded the result, leaving duplicate index labels.
            queue = queue.reset_index(drop=True)

            push_to_hf_hub(valid_arxiv_ids, request_arxiv_repo_id, hf_token)
        else:
            gr.Warning(f"All requested arXiv IDs are already processed or being processed...")
    else:
        gr.Warning(f"No valid arXiv IDs found...")

    return (
        queue, gr.Textbox("")
    )
def get_paper_by_year(y):
    """Refresh Month/Day/Paper dropdowns after the Year selection changes.

    Defaults to the latest month and day available for the chosen year.
    """
    months = sorted(date_dict[y].keys())
    newest_month = months[-1]

    days = sorted(date_dict[y][newest_month].keys())
    newest_day = days[-1]

    # De-duplicate titles for that day.
    day_titles = list({p["title"] for p in date_dict[y][newest_month][newest_day]})

    return (
        gr.Dropdown(choices=months, value=newest_month),
        gr.Dropdown(choices=days, value=newest_day),
        gr.Dropdown(choices=day_titles, value=day_titles[0]),
    )
def get_paper_by_month(y, m):
    """Refresh Day/Paper dropdowns after the Month selection changes.

    Defaults to the latest day available for the chosen year/month.
    """
    days = sorted(date_dict[y][m].keys())
    newest_day = days[-1]

    # De-duplicate titles for that day.
    day_titles = list({p["title"] for p in date_dict[y][m][newest_day]})

    return (
        gr.Dropdown(choices=days, value=newest_day),
        gr.Dropdown(choices=day_titles, value=day_titles[0]),
    )
def get_paper_by_day(y, m, d):
    """Refresh the paper-title dropdown after the Day selection changes."""
    day_titles = list({p["title"] for p in date_dict[y][m][d]})
    return gr.Dropdown(choices=day_titles, value=day_titles[0])
def set_paper(y, m, d, paper_title):
    """Render the markdown components for the paper selected by title.

    Looks the paper up under date_dict[y][m][d] and returns, in order:
    title, arXiv badge, HF paper-page badge, summary, followed by the 27
    generated Q&A markdown blocks (3 base questions x 9 components each).
    """
    chosen = None
    for candidate in date_dict[y][m][d]:
        if candidate["title"] == paper_title:
            chosen = candidate
            break

    outputs = [
        gr.Markdown(f"# {chosen['title']}"),
        gr.Markdown(
            "[![arXiv](https://img.shields.io/badge/arXiv-%s-b31b1b.svg)](https://arxiv.org/abs/%s)" % (chosen['arxiv_id'], chosen['arxiv_id'])
        ),
        gr.Markdown(
            "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/%s)" % chosen['arxiv_id']
        ),
        gr.Markdown(chosen["summary"]),
    ]

    # Per base question: question + ELI5/Technical answers, then the depth
    # and breadth follow-ups with their own answer pair.
    # ("breath" matches the key spelling used by the dataset.)
    for i in range(3):
        outputs.append(gr.Markdown(f"### π {chosen[f'{i}_question']}"))
        outputs.append(gr.Markdown(f"βͺ **(ELI5)** {chosen[f'{i}_answers:eli5']}"))
        outputs.append(gr.Markdown(f"βͺ **(Technical)** {chosen[f'{i}_answers:expert']}"))

        for kind in ("depth", "breath"):
            outputs.append(gr.Markdown(f"### ππ {chosen[f'{i}_additional_{kind}_q:follow up question']}"))
            outputs.append(gr.Markdown(f"βͺ **(ELI5)** {chosen[f'{i}_additional_{kind}_q:answers:eli5']}"))
            outputs.append(gr.Markdown(f"βͺ **(Technical)** {chosen[f'{i}_additional_{kind}_q:answers:expert']}"))

    return tuple(outputs)
def change_exp_type(exp_type):
    """Toggle visibility between the ELI5 and Technical answer blocks.

    Returns 18 gr.Markdown visibility updates: nine (ELI5, Technical)
    pairs, one pair per answer slot. Selecting "ELI5" shows the first
    member of each pair; anything else shows the second.
    """
    eli5_visible = exp_type == "ELI5"

    updates = []
    for _ in range(9):
        updates.append(gr.Markdown(visible=eli5_visible))
        updates.append(gr.Markdown(visible=not eli5_visible))

    return tuple(updates)
def search(search_in, max_results=3):
    """Case-sensitive substring search over all paper titles.

    Args:
        search_in: substring typed into the search box.
        max_results: maximum number of matches to collect.

    Returns:
        Three gr.Textbox updates; the first len(results) are visible and
        filled with matching titles, the rest hidden and empty.
    """
    results = []
    for title in titles:
        # BUG FIX: honor `max_results` — the original hard-coded `> 3`,
        # which ignored the parameter and collected a 4th match that the
        # three result textboxes could never display.
        if len(results) >= max_results:
            break
        if search_in in title:
            results.append(title)

    return (
        gr.Textbox(
            visible=len(results) > 0,
            value=results[0] if len(results) > 0 else ""
        ),
        gr.Textbox(
            visible=len(results) > 1,
            value=results[1] if len(results) > 1 else ""
        ),
        gr.Textbox(
            visible=len(results) > 2,
            value=results[2] if len(results) > 2 else ""
        )
    )
def set_date(title):
    """Locate the (year, month, day) a paper title belongs to.

    Walks the date hierarchy and, on the first match, returns dropdown
    updates selecting that date (month/day choices restricted to what
    exists for the matched year/month). Implicitly returns None when the
    title is not found, which leaves the dropdowns unchanged.
    """
    # IDIOM FIX: iterate .items() directly — the original wrapped each
    # loop in enumerate() and discarded every index.
    for year, months in date_dict.items():
        for month, days in months.items():
            for day, papers in days.items():
                for paper in papers:
                    if paper['title'] == title:
                        return (
                            gr.Dropdown(value=year),
                            gr.Dropdown(choices=sorted(months), value=month),
                            gr.Dropdown(choices=sorted(days), value=day),
                        )
def set_papers(y, m, d, title):
    """Refresh the paper dropdown for a date and clear the search box.

    Used after a search-result click has set the date dropdowns: selects
    `title` among the (de-duplicated) titles of that day.
    """
    available = list({p["title"] for p in date_dict[y][m][d]})
    return (
        gr.Dropdown(choices=available, value=title),
        gr.Textbox(""),
    )
# --- Gradio UI -----------------------------------------------------------
# NOTE(review): indentation was reconstructed from a flattened source —
# verify the nesting of the layout context managers against the original.
with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Let's explore papers with auto generated Q&As")

    # Control panel: Year/Month/Day + paper-title dropdowns, plus a keyword
    # search box backed by up to ten hidden result buttons.
    with gr.Column(elem_id="control-panel", elem_classes=["group"]):
        with gr.Column():
            with gr.Row():
                year_dd = gr.Dropdown(sorted_year, value=last_year, label="Year", interactive=True, filterable=False)
                month_dd = gr.Dropdown(sorted_month, value=last_month, label="Month", interactive=True, filterable=False)
                day_dd = gr.Dropdown(sorted_day, value=last_day, label="Day", interactive=True, filterable=False)
            papers_dd = gr.Dropdown(
                list(set([paper["title"] for paper in last_papers])),
                value=selected_paper["title"],
                label="Select paper title",
                interactive=True,
                filterable=False
            )
        with gr.Column(elem_classes=["no-gap"]):
            search_in = gr.Textbox("", placeholder="Enter keywords to search...", elem_classes=["textbox-no-label"])
            # Hidden result slots; shown/filled client-side by the
            # UPDATE_SEARCH_RESULTS JS bound to search_in.change below.
            search_r1 = gr.Button(visible=False, elem_id="search_r1", elem_classes=["no-radius"])
            search_r2 = gr.Button(visible=False, elem_id="search_r2", elem_classes=["no-radius"])
            search_r3 = gr.Button(visible=False, elem_id="search_r3", elem_classes=["no-radius"])
            search_r4 = gr.Button(visible=False, elem_id="search_r4", elem_classes=["no-radius"])
            search_r5 = gr.Button(visible=False, elem_id="search_r5", elem_classes=["no-radius"])
            search_r6 = gr.Button(visible=False, elem_id="search_r6", elem_classes=["no-radius"])
            search_r7 = gr.Button(visible=False, elem_id="search_r7", elem_classes=["no-radius"])
            search_r8 = gr.Button(visible=False, elem_id="search_r8", elem_classes=["no-radius"])
            search_r9 = gr.Button(visible=False, elem_id="search_r9", elem_classes=["no-radius"])
            search_r10 = gr.Button(visible=False, elem_id="search_r10", elem_classes=["no-radius"])

    # Q&As vs Chat toggle (hidden; chat is not enabled yet).
    conv_type = gr.Radio(choices=["Q&As", "Chat"], value="Q&As", interactive=True, visible=False, elem_classes=["conv-type"])

    with gr.Column(scale=7):
        # Selected paper header: title, badges, summary.
        title = gr.Markdown(f"# {selected_paper['title']}")
        with gr.Row():
            arxiv_link = gr.Markdown(
                "[![arXiv](https://img.shields.io/badge/arXiv-%s-b31b1b.svg)](https://arxiv.org/abs/%s)" % (selected_paper['arxiv_id'], selected_paper['arxiv_id'])
            )
            hf_paper_link = gr.Markdown(
                "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/%s)" % selected_paper['arxiv_id']
            )
            gr.Button("Chat about the paper", interactive=False)
        summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])

        # Placeholder chat panel (hidden until chat is implemented).
        with gr.Column(elem_id="chat_block", visible=False):
            gr.Chatbot([("hello", "world"), ("how", "are you?")])

        # Q&A panel: three question groups, each with ELI5/Technical answer
        # pairs plus two follow-up accordions (depth & "breath"/breadth).
        with gr.Column(elem_id="qna_block", visible=True):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("## Auto generated Questions & Answers")
                exp_type = gr.Radio(choices=["ELI5", "Technical"], value="ELI5", elem_classes=["exp-type"], scale=3)

            # 1
            with gr.Column(elem_classes=["group"], visible=True) as q_0:
                basic_q_0 = gr.Markdown(f"### π {selected_paper['0_question']}")
                basic_q_eli5_0 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['0_answers:eli5']}", elem_classes=["small-font"])
                basic_q_expert_0 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['0_answers:expert']}", visible=False, elem_classes=["small-font"])

                with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_0_0:
                    depth_q_0 = gr.Markdown(f"### ππ {selected_paper['0_additional_depth_q:follow up question']}")
                    depth_q_eli5_0 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
                    depth_q_expert_0 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])

                with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_0_1:
                    breath_q_0 = gr.Markdown(f"### ππ {selected_paper['0_additional_breath_q:follow up question']}")
                    breath_q_eli5_0 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
                    breath_q_expert_0 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])

            # 2
            with gr.Column(elem_classes=["group"], visible=True) as q_1:
                basic_q_1 = gr.Markdown(f"### π {selected_paper['1_question']}")
                basic_q_eli5_1 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['1_answers:eli5']}", elem_classes=["small-font"])
                basic_q_expert_1 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['1_answers:expert']}", visible=False, elem_classes=["small-font"])

                with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_1_0:
                    depth_q_1 = gr.Markdown(f"### ππ {selected_paper['1_additional_depth_q:follow up question']}")
                    depth_q_eli5_1 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
                    depth_q_expert_1 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])

                with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_1_1:
                    breath_q_1 = gr.Markdown(f"### ππ {selected_paper['1_additional_breath_q:follow up question']}")
                    breath_q_eli5_1 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
                    breath_q_expert_1 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])

            # 3
            with gr.Column(elem_classes=["group"], visible=True) as q_2:
                basic_q_2 = gr.Markdown(f"### π {selected_paper['2_question']}")
                basic_q_eli5_2 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['2_answers:eli5']}", elem_classes=["small-font"])
                basic_q_expert_2 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['2_answers:expert']}", visible=False, elem_classes=["small-font"])

                with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_2_0:
                    depth_q_2 = gr.Markdown(f"### ππ {selected_paper['2_additional_depth_q:follow up question']}")
                    depth_q_eli5_2 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
                    depth_q_expert_2 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])

                with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_2_1:
                    breath_q_2 = gr.Markdown(f"### ππ {selected_paper['2_additional_breath_q:follow up question']}")
                    breath_q_eli5_2 = gr.Markdown(f"βͺ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
                    breath_q_expert_2 = gr.Markdown(f"βͺ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])

    # Request queue: shows pending arXiv IDs and accepts new ones.
    gr.Markdown("## Request any arXiv ids")
    arxiv_queue = gr.Dataframe(
        headers=["Requested arXiv IDs"], col_count=(1, "fixed"),
        value=requested_arxiv_ids_df,
        datatype=["str"],
        interactive=False
    )
    arxiv_id_enter = gr.Textbox(placeholder="Enter comma separated arXiv IDs...", elem_classes=["textbox-no-label"])
    arxiv_id_enter.submit(
        add_arxiv_ids_to_queue,
        [arxiv_queue, arxiv_id_enter],
        [arxiv_queue, arxiv_id_enter]
    )

    gr.Markdown("The target papers are collected from [Hugging Face π€ Daily Papers](https://huggingface.co/papers) on a daily basis. "
                "The entire data is generated by [Google's Gemini 1.0](https://deepmind.google/technologies/gemini/) Pro. "
                "If you are curious how it is done, visit the [Auto Paper Q&A Generation project repository](https://github.com/deep-diver/auto-paper-analysis) "
                "Also, the generated dataset is hosted on Hugging Face π€ Dataset repository as well([Link](https://huggingface.co/datasets/chansung/auto-paper-qa2)). ")

    # Clicking a search result jumps to that paper's date, then selects it.
    search_r1.click(set_date, search_r1, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r1],
        outputs=[papers_dd, search_in]
    )
    search_r2.click(set_date, search_r2, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r2],
        outputs=[papers_dd, search_in]
    )
    search_r3.click(set_date, search_r3, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r3],
        outputs=[papers_dd, search_in]
    )
    search_r4.click(set_date, search_r4, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r4],
        outputs=[papers_dd, search_in]
    )
    search_r5.click(set_date, search_r5, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r5],
        outputs=[papers_dd, search_in]
    )
    search_r6.click(set_date, search_r6, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r6],
        outputs=[papers_dd, search_in]
    )
    search_r7.click(set_date, search_r7, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r7],
        outputs=[papers_dd, search_in]
    )
    search_r8.click(set_date, search_r8, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r8],
        outputs=[papers_dd, search_in]
    )
    search_r9.click(set_date, search_r9, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r9],
        outputs=[papers_dd, search_in]
    )
    search_r10.click(set_date, search_r10, [year_dd, month_dd, day_dd]).then(
        set_papers,
        inputs=[year_dd, month_dd, day_dd, search_r10],
        outputs=[papers_dd, search_in]
    )

    # Date dropdowns cascade (year -> month -> day -> papers), then re-render
    # the selected paper. set_paper returns 31 components:
    # title + 2 badge links + summary + 27 Q&A markdowns.
    year_dd.input(
        get_paper_by_year,
        inputs=[year_dd],
        outputs=[month_dd, day_dd, papers_dd]
    ).then(
        set_paper,
        [year_dd, month_dd, day_dd, papers_dd],
        [
            # CONSISTENCY FIX: this list originally omitted arxiv_link and
            # hf_paper_link, mismatching set_paper's 31 return values and
            # the identical bindings for month_dd/day_dd/papers_dd below.
            title, arxiv_link, hf_paper_link, summary,
            basic_q_0, basic_q_eli5_0, basic_q_expert_0,
            depth_q_0, depth_q_eli5_0, depth_q_expert_0,
            breath_q_0, breath_q_eli5_0, breath_q_expert_0,
            basic_q_1, basic_q_eli5_1, basic_q_expert_1,
            depth_q_1, depth_q_eli5_1, depth_q_expert_1,
            breath_q_1, breath_q_eli5_1, breath_q_expert_1,
            basic_q_2, basic_q_eli5_2, basic_q_expert_2,
            depth_q_2, depth_q_eli5_2, depth_q_expert_2,
            breath_q_2, breath_q_eli5_2, breath_q_expert_2
        ]
    )
    month_dd.input(
        get_paper_by_month,
        inputs=[year_dd, month_dd],
        outputs=[day_dd, papers_dd]
    ).then(
        set_paper,
        [year_dd, month_dd, day_dd, papers_dd],
        [
            title, arxiv_link, hf_paper_link, summary,
            basic_q_0, basic_q_eli5_0, basic_q_expert_0,
            depth_q_0, depth_q_eli5_0, depth_q_expert_0,
            breath_q_0, breath_q_eli5_0, breath_q_expert_0,
            basic_q_1, basic_q_eli5_1, basic_q_expert_1,
            depth_q_1, depth_q_eli5_1, depth_q_expert_1,
            breath_q_1, breath_q_eli5_1, breath_q_expert_1,
            basic_q_2, basic_q_eli5_2, basic_q_expert_2,
            depth_q_2, depth_q_eli5_2, depth_q_expert_2,
            breath_q_2, breath_q_eli5_2, breath_q_expert_2
        ]
    )
    day_dd.input(
        get_paper_by_day,
        inputs=[year_dd, month_dd, day_dd],
        outputs=[papers_dd]
    ).then(
        set_paper,
        [year_dd, month_dd, day_dd, papers_dd],
        [
            title, arxiv_link, hf_paper_link, summary,
            basic_q_0, basic_q_eli5_0, basic_q_expert_0,
            depth_q_0, depth_q_eli5_0, depth_q_expert_0,
            breath_q_0, breath_q_eli5_0, breath_q_expert_0,
            basic_q_1, basic_q_eli5_1, basic_q_expert_1,
            depth_q_1, depth_q_eli5_1, depth_q_expert_1,
            breath_q_1, breath_q_eli5_1, breath_q_expert_1,
            basic_q_2, basic_q_eli5_2, basic_q_expert_2,
            depth_q_2, depth_q_eli5_2, depth_q_expert_2,
            breath_q_2, breath_q_eli5_2, breath_q_expert_2
        ]
    )
    papers_dd.change(
        set_paper,
        [year_dd, month_dd, day_dd, papers_dd],
        [
            title, arxiv_link, hf_paper_link, summary,
            basic_q_0, basic_q_eli5_0, basic_q_expert_0,
            depth_q_0, depth_q_eli5_0, depth_q_expert_0,
            breath_q_0, breath_q_eli5_0, breath_q_expert_0,
            basic_q_1, basic_q_eli5_1, basic_q_expert_1,
            depth_q_1, depth_q_eli5_1, depth_q_expert_1,
            breath_q_1, breath_q_eli5_1, breath_q_expert_1,
            basic_q_2, basic_q_eli5_2, basic_q_expert_2,
            depth_q_2, depth_q_eli5_2, depth_q_expert_2,
            breath_q_2, breath_q_eli5_2, breath_q_expert_2
        ]
    )

    # Search runs entirely client-side (fn=None): the JS template is
    # instantiated with the full title list.
    search_in.change(
        inputs=[search_in],
        outputs=[
            search_r1, search_r2, search_r3, search_r4, search_r5,
            search_r6, search_r7, search_r8, search_r9, search_r10
        ],
        js=UPDATE_SEARCH_RESULTS % str(list(titles)),
        fn=None
    )

    # Show/hide the ELI5 vs Technical answer variants.
    exp_type.select(
        change_exp_type,
        exp_type,
        [
            basic_q_eli5_0, basic_q_expert_0, depth_q_eli5_0, depth_q_expert_0, breath_q_eli5_0, breath_q_expert_0,
            basic_q_eli5_1, basic_q_expert_1, depth_q_eli5_1, depth_q_expert_1, breath_q_eli5_1, breath_q_expert_1,
            basic_q_eli5_2, basic_q_expert_2, depth_q_eli5_2, depth_q_expert_2, breath_q_eli5_2, breath_q_expert_2
        ]
    )

    # Q&As <-> Chat toggle handled client-side.
    conv_type.select(
        inputs=[conv_type],
        js=UPDATE_IF_TYPE,
        outputs=None,
        fn=None
    )
# Run the arXiv-request processor once an hour, starting one minute after
# app launch.
start_date = datetime.now() + timedelta(minutes=1)

scheduler = BackgroundScheduler()
scheduler.add_job(
    process_arxiv_ids,
    trigger='interval',
    seconds=3600,
    args=[
        gemini_api_key,
        dataset_repo_id,
        request_arxiv_repo_id,
        hf_token
    ],
    start_date=start_date
)
scheduler.start()

demo.launch(share=True, debug=True)