# blue-arxiv / app.py
# (Hugging Face file-viewer header: yoinked's picture | Update app.py | d6e3fde verified | raw | history blame | 6.1 kB)
import huggingface_hub as hf
import gradio as gr
import os, datetime
# Handle on the Hugging Face Hub filesystem used for every read and write in
# this file. The token comes from the HF_TOKEN environment variable; the
# subscript raises KeyError at import time when it is unset.
fs = hf.HfFileSystem(token=os.environ["HF_TOKEN"])
# Path prefix under which each paper is stored as "<paperid>.md".
datasetdir = "datasets/yoinked/blue-arxiv-papers/"
# Stylesheet passed to gr.Blocks(css=...). It styles the paper cards emitted by
# make_paper_card / make_paper_cards: the flex wrapper (.caaard-containers),
# each card (.caaard-container), its title, and the author/date/id lines.
basecss = """
.caaard-container {
width: 250px;
padding: 20px;
border: 3px solid black;
border-radius: 15px;
text-align: left;
}
.title {
font-size: 24px;
margin-bottom: 10px;
text-align: center;
}
.caaard-containers {
display: flex; gap: 20px; flex-wrap: wrap;
}
.extra-info {
font-size: 14px;
line-height: 1.5;
}
.extra-info-paperid {
font-size: 14px;
line-height: 1.5;
color: #222;
}"""
# Browser-side helper passed to gr.Blocks(js=...). copyToClipboard(container)
# copies the text of the container's ".title" element to the clipboard by
# selecting it inside a temporary <textarea>; make_paper_card wires it to each
# card's onclick attribute.
# NOTE(review): Gradio documents `js` as code executed on page load - confirm
# that copyToClipboard ends up callable from the inline onclick handlers.
jscode = """
function copyToClipboard(container) {
const titleElement = container.querySelector(".title");
const titleText = titleElement.textContent;
const tempTextArea = document.createElement("textarea");
tempTextArea.value = titleText;
document.body.appendChild(tempTextArea);
tempTextArea.select();
document.execCommand("copy");
document.body.removeChild(tempTextArea);
}
"""
def get_papers():
    """List the markdown paper files stored under ``datasetdir``.

    Returns:
        Whatever ``fs.glob`` yields for the ``**.md`` pattern rooted at
        ``datasetdir``.
    """
    pattern = f"{datasetdir}**.md"
    return fs.glob(pattern)
def _parse_front_matter(text):
    """Split *text* into its raw front matter and the ``key: value`` fields in it.

    The front matter is the text between the first two ``---`` markers (or
    everything after the first marker when there is no second one). Returns
    ``None`` when *text* contains no ``---`` marker at all.
    """
    parts = text.split("---", 2)
    if len(parts) < 2:
        return None
    raw = parts[1]
    fields = {}
    for line in raw.splitlines():
        key, sep, value = line.partition(": ")
        if sep:
            # Keys are matched per line, so "subtitle: x" is never mistaken
            # for "title: x"; the first occurrence of a key wins.
            fields.setdefault(key.strip(), value.strip())
    return raw, fields


def get_papers_metadata(papiers=None):
    """Collect the front-matter metadata of each paper file.

    Args:
        papiers: Iterable of paper paths readable through ``fs``. When
            ``None``, the paths come from ``get_papers()``.

    Returns:
        A list of dicts with the keys ``fname``, ``metadata`` (raw
        front-matter text), ``author``, ``title``, ``tags`` (list of
        stripped, non-empty strings), ``abstract``, ``date_published`` and
        ``paperid``. Missing text fields default to ``"unknown"``. Files
        without front matter or without a ``paperid`` line are skipped, since
        there is no way to reference them.
    """
    if papiers is None:
        papiers = get_papers()
    metadatas = []
    for paper in papiers:
        # read_text is the fsspec text-reading API and always returns str.
        parsed = _parse_front_matter(fs.read_text(paper, encoding="utf-8"))
        if parsed is None:
            continue
        raw, fields = parsed
        if "paperid" not in fields:
            continue
        # Split on "," and strip so that "a,b" and "a, b" both give two tags,
        # matching how the search query is split.
        tags = [t.strip() for t in fields.get("tags", "").split(",") if t.strip()]
        metadatas.append(
            {
                "fname": paper,
                "metadata": raw,
                "author": fields.get("author", "unknown"),
                "title": fields.get("title", "unknown"),
                "tags": tags,
                "abstract": fields.get("abstract", "unknown"),
                "date_published": fields.get("date_published", "unknown"),
                "paperid": fields["paperid"],
            }
        )
    return metadatas
def make_paper_card(md):
    """Render one paper's metadata as an HTML card.

    Args:
        md: Metadata dict with the keys ``abstract``, ``title``, ``author``,
            ``date_published`` and ``paperid``.

    Returns:
        HTML for a ``caaard-container`` div. The abstract is placed in the
        ``title`` attribute (tooltip) and clicking the card calls the
        ``copyToClipboard`` JavaScript helper.
    """
    # Imported here so the block brings its own dependency into scope.
    from html import escape

    # The metadata comes from user-submitted files, so every interpolated value
    # is escaped; quote=True also protects the title="..." attribute.
    abstract = escape(str(md["abstract"]), quote=True)
    title = escape(str(md["title"]), quote=True)
    author = escape(str(md["author"]), quote=True)
    published = escape(str(md["date_published"]), quote=True)
    paperid = escape(str(md["paperid"]), quote=True)
    return f"""
<div class="caaard-container" onclick="copyToClipboard(this)" title="{abstract}">
<div class="title">{title}</div>
<br><br>
<div class="extra-info">author: {author}</div>
<div class="extra-info">published: {published}</div>
<div class="extra-info-paperid">id: {paperid}</div>
</div>
"""
def make_paper_cards(tags=None):
    """Build the HTML for all paper cards, optionally filtered by tag.

    Args:
        tags: Comma-separated tag query. ``None``, an empty string or a query
            made only of commas/whitespace means "no filter" and shows every
            paper.

    Returns:
        A ``caaard-containers`` div wrapping one card per matching paper, each
        followed by ``<br>``. A paper matches when any queried tag is in its
        tag list.
    """
    mds = get_papers_metadata()
    # `tags or ""` covers the None default; blank entries are dropped so an
    # empty search box does not filter on the empty string.
    wanted = [tag.strip() for tag in (tags or "").split(",")]
    wanted = [tag for tag in wanted if tag]
    if wanted:
        mds = [md for md in mds if any(tag in md["tags"] for tag in wanted)]
    cards = "".join(make_paper_card(md) + "<br>" for md in mds)
    return f"<div class='caaard-containers'>{cards}</div>"
def get_paper_markdown(paperid):
    """Return the markdown body of the paper whose metadata id is *paperid*.

    Args:
        paperid: Paper id as typed by the user; surrounding whitespace is
            ignored.

    Returns:
        The text after the closing ``---`` of the front matter,
        ``"## paper not found"`` when no paper has that id, or
        ``"## paper is empty"`` when the file has no closing front-matter
        marker and therefore no body.
    """
    wanted = (paperid or "").strip()
    fname = next(
        (paper["fname"] for paper in get_papers_metadata() if paper["paperid"] == wanted),
        None,
    )
    if fname is None:
        return "## paper not found"
    # maxsplit=2 keeps any "---" inside the body (horizontal rules, table
    # separators) instead of cutting the paper off at the first one.
    parts = fs.read_text(fname, encoding="utf-8").split("---", 2)
    if len(parts) < 3:
        return "## paper is empty"
    return parts[2]
def _clean_field(value):
    """Make *value* safe to store on a single front-matter line."""
    value = value or ""
    # primitive anti-xss sanitization: drop < > : | and backslash
    value = value.translate(str.maketrans("", "", "<>:|\\"))
    # A newline would start a new front-matter line (and could inject keys).
    value = " ".join(value.splitlines())
    # "---" is the front-matter delimiter the readers split on.
    while "---" in value:
        value = value.replace("---", "--")
    return value


def publish_paper(title, authors, tags, abst, data):
    """Write a new paper file (front matter + markdown body) to the dataset.

    The paper id has the form ``YYYY-MMDD.N`` where ``N`` is the first index
    in 1..100 whose file does not exist yet.

    Args:
        title: Paper title.
        authors: Author string.
        tags: Comma-separated tags.
        abst: Short abstract shown as the card tooltip.
        data: Markdown body, stored unmodified after the front matter.

    Returns:
        A status message: the assigned paper id on success, or an error text
        when all 100 ids for today are taken.
    """
    # One timestamp for everything so the id and the published date cannot
    # disagree when the call straddles midnight.
    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m%d")
    date_published = now.strftime("%Y-%m-%d")

    for idx in range(1, 101):
        paperid = f"{stamp}.{idx}"
        path = f"{datasetdir}{paperid}.md"
        if not fs.exists(path):
            break
    else:
        return "could not generate paperid, try again tomorrow"

    title = _clean_field(title)
    authors = _clean_field(authors)
    tags = _clean_field(tags)
    abst = _clean_field(abst)
    metadata = (
        "---\n"
        f"title: {title}\n"
        f"author: {authors}\n"
        f"tags: {tags}\n"
        f"abstract: {abst}\n"
        f"date_published: {date_published}\n"
        f"paperid: {paperid}\n"
        "---\n"
    )
    with fs.open(path, "w", encoding="utf-8") as f:
        f.write(metadata + (data or ""))
    # Report success so the status box is not left blank.
    return f"published as {paperid}"
def makepreview(x):
    """Return the markdown source unchanged.

    Wired to the markdown textbox's change event so that the Markdown
    component next to it shows a live preview.
    """
    preview_source = x
    return preview_source
# UI definition: three tabs (search / read / publish) plus their event wiring.
with gr.Blocks(css=basecss, js=jscode, theme="NoCrypt/miku") as demo:
    # --- search: filter papers by tag and show them as cards ---
    with gr.Tab("search"):
        with gr.Row():
            query = gr.Textbox(
                label="tags (optional, comma seperated)",
                lines=1,
                interactive=True,
            )
            searchbutton = gr.Button("search")
        with gr.Row():
            papercards = gr.HTML()

    # --- read: render one paper by its id ---
    with gr.Tab("read"):
        with gr.Row():
            paperid = gr.Textbox(
                label="paper id",
                lines=1,
                interactive=True,
            )
            readbutton = gr.Button("read")
        with gr.Row():
            paper = gr.Markdown()

    # --- publish: metadata fields, markdown editor with live preview ---
    with gr.Tab("publish"):
        with gr.Row():
            title = gr.Textbox(
                label="title",
                lines=1,
                interactive=True,
            )
            authors = gr.Textbox(
                label="author(s)",
                lines=1,
                interactive=True,
            )
        with gr.Row():
            tags = gr.Textbox(
                label="tags (optional, comma seperated)",
                lines=1,
                interactive=True,
            )
            abst = gr.Textbox(
                label="abriged abstract (aka tooltip)",
                lines=2,
                interactive=True,
            )
        markd = gr.Textbox(
            label="markdown",
            lines=10,
            interactive=True,
            max_lines=1e3,
        )
        preview = gr.Markdown()
        with gr.Row():
            status = gr.Textbox(
                label="status",
                lines=1,
                interactive=False,
            )
            publishbutton = gr.Button("publish")

    # Event wiring, registered in the same order as before.
    markd.change(
        fn=makepreview,
        inputs=markd,
        outputs=preview,
    )
    publishbutton.click(
        fn=publish_paper,
        inputs=[title, authors, tags, abst, markd],
        outputs=status,
    )
    searchbutton.click(
        fn=make_paper_cards,
        inputs=query,
        outputs=papercards,
    )
    readbutton.click(
        fn=get_paper_markdown,
        inputs=paperid,
        outputs=paper,
    )

demo.launch()