|
|
|
|
|
|
|
|
|
import tempfile
|
|
from datetime import datetime
|
|
import requests
|
|
|
|
from App_Function_Libraries.PDF.PDF_Ingestion_Lib import extract_text_and_format_from_pdf
|
|
|
|
|
|
from App_Function_Libraries.Third_Party.Arxiv import convert_xml_to_markdown, fetch_arxiv_xml, parse_arxiv_feed, \
|
|
build_query_url, ARXIV_PAGE_SIZE, fetch_arxiv_pdf_url
|
|
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
|
|
|
|
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
def create_arxiv_tab():
    """Build the "Arxiv Search & Ingest" Gradio tab and wire its event handlers.

    The tab contains:
      * a search form (query / author / year) with a "Search" button,
      * a radio list of result titles with "Previous Page" / "Next Page" buttons,
      * a Markdown preview of the selected paper's PDF text,
      * an ingest form that stores the selected paper via
        ``add_media_with_keywords``.

    Per-session data (pagination info, the current result entries and the id of
    the paper being previewed) lives in ``gr.State`` components. Every handler
    receives the state it needs through ``inputs`` and hands the updated value
    back through ``outputs``; handlers never touch ``State.value`` directly,
    because that attribute is the component's initial value rather than the
    value belonging to the current session.

    Returns:
        None. The components are created as a side effect inside the
        ``gr.TabItem`` context opened by this function.
    """
    # Seconds allowed for each arXiv HTTP request, so a stalled connection
    # cannot block a worker forever.
    request_timeout = 30

    with gr.TabItem("Arxiv Search & Ingest", visible=True):
        gr.Markdown("# arXiv Search, Browse, Download, and Ingest")
        gr.Markdown("#### Thank you to arXiv for use of its open access interoperability.")
        with gr.Row():
            with gr.Column(scale=1):
                # Search inputs
                with gr.Row():
                    with gr.Column():
                        search_query = gr.Textbox(label="Search Query", placeholder="e.g., machine learning")
                        author_filter = gr.Textbox(label="Author", placeholder="e.g., John Doe")
                        year_filter = gr.Number(label="Year", precision=0)
                        search_button = gr.Button("Search")

            with gr.Column(scale=2):
                # Result list and pagination controls
                paper_selector = gr.Radio(label="Select a Paper", choices=[], interactive=True)
                prev_button = gr.Button("Previous Page")
                next_button = gr.Button("Next Page")
                page_info = gr.Textbox(label="Page", value="1", interactive=False)

        # Preview and ingestion section
        with gr.Row():
            with gr.Column():
                paper_view = gr.Markdown(label="Paper Details")
                arxiv_keywords = gr.Textbox(label="Additional Keywords (comma-separated)",
                                            placeholder="e.g., AI, Deep Learning")
                ingest_button = gr.Button("Ingest Selected Paper")
                ingest_result = gr.Textbox(label="Ingestion Result", interactive=False)

        # Session state: pagination/search results, and the previewed paper id.
        state = gr.State(value={"start": 0, "current_page": 1, "last_query": None, "entries": []})
        selected_paper_id = gr.State(value=None)

        def fetch_entries(query, author, year, start):
            """Request one page of arXiv results and return the parsed entries.

            Raises:
                requests.exceptions.RequestException: on network failure,
                    timeout or a non-2xx HTTP status.
            """
            url = build_query_url(query, author, year, start)
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()
            return parse_arxiv_feed(response.text)

        def download_pdf_as_markdown(paper_id):
            """Download the PDF for ``paper_id`` and return its extracted text.

            The PDF is written to a temporary file only so the extractor can
            read it from a path; the file is removed again before returning,
            whether or not extraction succeeded.
            """
            import os  # local import: only needed for temp-file cleanup here

            pdf_url = fetch_arxiv_pdf_url(paper_id)
            response = requests.get(pdf_url, timeout=request_timeout)
            response.raise_for_status()

            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            temp_pdf_path = temp_pdf.name
            try:
                # Close the handle before extraction so the file can be
                # reopened by path on every platform.
                with temp_pdf:
                    temp_pdf.write(response.content)
                return extract_text_and_format_from_pdf(temp_pdf_path)
            finally:
                try:
                    os.remove(temp_pdf_path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the real result or error.
                    pass

        def search_arxiv(query, author, year, current_state):
            """Run a fresh search and show the first page of results.

            Returns:
                A 3-tuple for ``(paper_selector, page_info, state)``. On a
                request failure the previous session state is returned
                unchanged and the error text is shown in ``page_info``.
            """
            try:
                entries = fetch_entries(query, author, year, 0)
            except requests.exceptions.RequestException as e:
                return (
                    gr.update(choices=[], value=None),
                    gr.update(value=f"**Error:** {str(e)}"),
                    current_state,
                )

            new_state = {
                "start": 0,
                "current_page": 1,
                "last_query": (query, author, year),
                "entries": entries,
            }
            if not entries:
                # Clear stale titles from an earlier search.
                return gr.update(choices=[], value=None), "No results found.", new_state

            titles = [entry['title'] for entry in entries]
            return gr.update(choices=titles), "1", new_state

        def handle_pagination(direction, current_state):
            """Move ``direction`` pages (+1 / -1) within the last search.

            Returns:
                A 3-tuple for ``(paper_selector, page_info, state)``. Nothing
                is changed when no search has been run yet, when the page
                number would not change, when the request fails, or when the
                requested page has no entries.
            """
            no_change = (gr.update(), gr.update(), current_state)

            last_query = (current_state or {}).get("last_query")
            if not last_query:
                # Next/Previous clicked before any search: nothing to page.
                return no_change
            query, author, year = last_query

            current_page = current_state["current_page"]
            new_page = max(1, current_page + direction)
            if new_page == current_page:
                # Already on the first page; avoid a pointless API request.
                return no_change

            start = (new_page - 1) * ARXIV_PAGE_SIZE
            try:
                entries = fetch_entries(query, author, year, start)
            except requests.exceptions.RequestException:
                return no_change
            if not entries:
                # Past the last page: stay where we are.
                return no_change

            new_state = {
                **current_state,
                "start": start,
                "current_page": new_page,
                "entries": entries,
            }
            titles = [entry['title'] for entry in entries]
            return gr.update(choices=titles), str(new_page), new_state

        def load_selected_paper(selected_title, current_state, current_paper_id):
            """Preview the paper whose title was picked in the radio list.

            Returns:
                A 2-tuple for ``(paper_view, selected_paper_id)``. The stored
                paper id is replaced only when the paper loaded successfully;
                otherwise the previous id is passed through unchanged.
            """
            if not selected_title:
                return "Please select a paper to view.", current_paper_id

            entries = (current_state or {}).get("entries", [])
            entry = next((item for item in entries if item['title'] == selected_title), None)
            if entry is None:
                return "Paper not found.", current_paper_id

            paper_id = entry['id']
            try:
                full_text_markdown = download_pdf_as_markdown(paper_id)
            except Exception as e:
                # UI boundary: report the failure instead of raising.
                return f"Error loading full paper: {str(e)}", current_paper_id
            return full_text_markdown, paper_id

        def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
            """Download, convert and store the paper identified by ``paper_id``.

            Args:
                paper_id: arXiv id kept in the ``selected_paper_id`` state.
                additional_keywords: optional comma-separated keywords typed
                    by the user; appended after ``arxiv`` and the categories.

            Returns:
                A status message for the "Ingestion Result" textbox.
            """
            try:
                if not paper_id:
                    return "Please select a paper to ingest."

                markdown_text = download_pdf_as_markdown(paper_id)

                # The metadata XML supplies title, authors and categories; the
                # first element of the returned tuple is not needed here.
                xml_content = fetch_arxiv_xml(paper_id)
                _, title, authors, categories = convert_xml_to_markdown(xml_content)

                paper_url = f"https://arxiv.org/abs/{paper_id}"

                # Join only non-empty parts so the keyword string never ends
                # up with a dangling comma.
                keyword_parts = ["arxiv", *categories]
                extra_keywords = (additional_keywords or "").strip()
                if extra_keywords:
                    keyword_parts.append(extra_keywords)
                keywords = ",".join(keyword_parts)

                author_names = ', '.join(authors)
                add_media_with_keywords(
                    url=paper_url,
                    title=title,
                    media_type='document',
                    content=markdown_text,
                    keywords=keywords,
                    prompt='No prompt for arXiv papers',
                    summary='Full arXiv paper ingested from PDF',
                    transcription_model='None',
                    author=author_names,
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )

                return f"arXiv paper '{title}' by {author_names} ingested successfully."
            except Exception as e:
                # UI boundary: show the failure in the result textbox.
                return f"Error processing arXiv paper: {str(e)}"

        # Event wiring. Every handler that reads or changes session data lists
        # the relevant gr.State in its inputs and/or outputs.
        search_button.click(
            fn=search_arxiv,
            inputs=[search_query, author_filter, year_filter, state],
            outputs=[paper_selector, page_info, state],
            queue=True
        )

        next_button.click(
            fn=lambda current_state: handle_pagination(1, current_state),
            inputs=[state],
            outputs=[paper_selector, page_info, state],
            queue=True
        )

        prev_button.click(
            fn=lambda current_state: handle_pagination(-1, current_state),
            inputs=[state],
            outputs=[paper_selector, page_info, state],
            queue=True
        )

        paper_selector.change(
            fn=load_selected_paper,
            inputs=[paper_selector, state, selected_paper_id],
            outputs=[paper_view, selected_paper_id],
            queue=True
        )

        ingest_button.click(
            fn=process_and_ingest_arxiv_paper,
            inputs=[selected_paper_id, arxiv_keywords],
            outputs=ingest_result,
            queue=True
        )
|
|
|
|
|
|
|
|
|
|
|