# ArxivCopilot / arxiv_agent.py
# Guanyu-Lin
# email
# 28ea05d
import os
import pickle
import json
import time
import datetime
from xml.etree import ElementTree
from huggingface_hub import CommitScheduler
from huggingface_hub import HfApi
from pathlib import Path
import requests
from datasets import load_dataset_builder
import warnings
warnings.filterwarnings("ignore")
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from utils import *
import thread6
# --- Module configuration -------------------------------------------------
# Upper bound passed as `max_results` to get_daily_papers(). Read from the
# environment at import time; a missing variable raises KeyError.
MAX_DAILY_PAPER = int(os.environ['MAX_DAILY_PAPER'])
# One day in seconds: the time.sleep() interval of the daily worker loops.
DAY_TIME = 60 * 60 * 24
# One day in minutes: passed as `every` to the CommitScheduler below
# (huggingface_hub documents `every` as a number of minutes).
DAY_TIME_MIN = 60 * 24
# Hugging Face dataset repository that stores the persisted state files.
DATA_REPO_ID = "cmulgy/ArxivCopilot_data"
# Access token read from the READ_WRITE environment variable (KeyError if unset).
READ_WRITE_TOKEN = os.environ['READ_WRITE']
api = HfApi(token = READ_WRITE_TOKEN)
# Local folder mirrored to the dataset repo; here the current working directory.
DATASET_DIR = Path(".")
DATASET_DIR.mkdir(parents=True, exist_ok=True)
from huggingface_hub import hf_hub_download
# Background scheduler that periodically commits DATASET_DIR to DATA_REPO_ID.
# Code in this file takes `scheduler.lock` around its file writes so that they
# do not interleave with a scheduled commit.
scheduler = CommitScheduler(
repo_id=DATA_REPO_ID,
repo_type="dataset",
folder_path=DATASET_DIR,
path_in_repo=".",
hf_api = api,
every = DAY_TIME_MIN,
)
def feedback_thought(input_ls): # preload
    """Record pairwise answer feedback and, if novel, store it as a thought.

    Intended to run in a worker thread (see ``update_feedback_thought``).

    Args:
        input_ls: sequence ``(agent, query, ansA, ansB, feedbackA, feedbackB)``.
            ``agent`` supplies the ``feedback``/``thought``/``thought_embedding``
            dicts, the matching ``*_path`` attributes and ``today``.

    Side effects:
        * Stores both answers and their feedback under
          ``agent.feedback[date][query]`` and rewrites the feedback file.
        * Takes the preferred answer (A when ``feedbackA == 1``, otherwise B),
          passes it through ``response_verify`` and, unless the result
          contains ``'idk'`` or is too similar to an existing thought,
          appends ``query + preferred_ans`` to ``agent.thought[date]`` together
          with its embedding, then rewrites the thought files.
    """
    agent, query, ansA, ansB, feedbackA, feedbackB = input_ls
    date = agent.today
    feedback_log = agent.feedback
    thought_log = agent.thought

    # Create the per-day / per-query containers on first use.
    entry = feedback_log.setdefault(date, {}).setdefault(query, {})
    thought_log.setdefault(date, [])

    entry["answerA"] = ansA
    entry["feedbackA"] = feedbackA
    entry["answerB"] = ansB
    entry["feedbackB"] = feedbackB

    # The lock keeps the write from interleaving with a scheduled commit.
    with scheduler.lock:
        with open(agent.feedback_path, "w") as f:
            json.dump(feedback_log, f)

    preferred_ans = ansA if feedbackA == 1 else ansB
    new_knowledge = response_verify([query], [preferred_ans], verify=False)

    if 'idk' in new_knowledge[0]:
        return

    new_knowledge_embedding = get_bert_embedding(new_knowledge)
    thought_embedding_all = []
    for day_embeddings in agent.thought_embedding.values():
        thought_embedding_all.extend(day_embeddings)

    similarity = calculate_similarity(thought_embedding_all, new_knowledge_embedding[0])
    similarity_values = [s.item() for s in similarity]  # tensor -> Python scalar

    # Only keep the new thought when no stored thought is a near-duplicate.
    if not all(s < 0.85 for s in similarity_values):
        return

    tem_thought = query + preferred_ans
    thought_log[date].append(tem_thought)
    # BUG FIX: the original rebound `agent.thought_embedding = {}` when `date`
    # was missing, which threw away the embeddings of every earlier day.
    # Only the missing day's list is created now.
    agent.thought_embedding.setdefault(date, []).append(
        get_bert_embedding([tem_thought])[0]
    )

    with scheduler.lock:
        with open(agent.thought_path, "w") as f:
            json.dump(thought_log, f)
        with open(agent.thought_embedding_path, "wb") as f:
            pickle.dump(agent.thought_embedding, f)
def dailyDownload(agent_ls):
    """Worker loop: once per DAY_TIME seconds, fetch new papers and refresh the agent.

    ``agent_ls`` is a one-element list holding the agent to update. Each cycle
    fetches papers per topic, merges them into the agent's paper JSON file,
    embeds the abstracts per day, merges those into the embedding pickle and
    finally publishes both merged results on the agent. Never returns.
    """
    agent = agent_ls[0]
    # topic -> arXiv query string
    topic_queries = {"Machine Learning": "Machine Learning"}

    while True:
        time.sleep(DAY_TIME)

        fetched_batches = []
        for topic, keyword in topic_queries.items():
            batch, newest = get_daily_papers(topic, query = keyword, max_results = MAX_DAILY_PAPER)
            agent.newest_day = newest
            fetched_batches.append(batch)

        merged_papers = update_json_file(agent.dataset_path, fetched_batches, scheduler)

        # Embed each day's abstracts, keyed by the formatted date string.
        embeddings_by_day = {}
        for batch in fetched_batches:
            for day in batch.keys():
                abstracts = batch[day]['abstract']
                day_embedding = get_bert_embedding(abstracts)
                embeddings_by_day[day.strftime("%m/%d/%Y")] = day_embedding

        merged_embeddings = update_pickle_file(agent.embedding_path, embeddings_by_day, scheduler)

        agent.paper = merged_papers
        agent.paper_embedding = merged_embeddings
        print("Today is " + agent.newest_day.strftime("%m/%d/%Y"))
def dailySave(agent_ls):
    """Worker loop: once per DAY_TIME seconds, flush the agent's in-memory state to disk.

    ``agent_ls`` is a one-element list holding the agent. All files are written
    while holding ``scheduler.lock``. Never returns.
    """
    agent = agent_ls[0]

    def write_json(path, payload):
        # Overwrite `path` with the JSON serialisation of `payload`.
        with open(path, "w") as handle:
            json.dump(payload, handle)

    while True:
        time.sleep(DAY_TIME)
        with scheduler.lock:
            write_json(agent.trend_idea_path, agent.trend_idea)
            write_json(agent.thought_path, agent.thought)
            with open(agent.thought_embedding_path, "wb") as handle:
                pickle.dump(agent.thought_embedding, handle)
            write_json(agent.profile_path, agent.profile)
            write_json(agent.comment_path, agent.comment)
class ArxivAgent:
    """In-memory state and operations behind the ArxivCopilot app.

    The agent keeps several caches (papers, paper embeddings, thoughts,
    thought embeddings, trends/ideas, profiles, e-mail sign-ups, feedback and
    comments), each backed by a file under ``DATASET_DIR / "dataset"``.
    Construction loads the caches, downloads the latest papers and starts the
    ``dailyDownload`` and ``dailySave`` worker threads.
    """

    # Seconds to wait on the arXiv API before giving up, so that a stalled
    # connection cannot block the caller forever.
    ARXIV_TIMEOUT = 60

    def __init__(self):
        """Set up file paths, load caches, fetch papers and start the workers."""
        self.dataset_path = DATASET_DIR / "dataset/paper.json"
        self.thought_path = DATASET_DIR / "dataset/thought.json"
        self.trend_idea_path = DATASET_DIR / "dataset/trend_idea.json"
        self.profile_path = DATASET_DIR / "dataset/profile.json"
        self.email_pool_path = DATASET_DIR / "dataset/email.json"
        self.comment_path = DATASET_DIR / "dataset/comment.json"
        self.embedding_path = DATASET_DIR / "dataset/paper_embedding.pkl"
        self.thought_embedding_path = DATASET_DIR / "dataset/thought_embedding.pkl"
        self.feedback_path = DATASET_DIR / "dataset/feedback.json"

        # Date string fixed at construction time; used to key feedback entries.
        self.today = datetime.datetime.now().strftime("%m/%d/%Y")
        # Replaced by download() with the second value get_daily_papers() returns.
        self.newest_day = ""

        self.load_cache()
        self.download()
        try:
            thread6.run_threaded(dailyDownload, [self])
            thread6.run_threaded(dailySave, [self])
        except Exception:
            print("Error: unable to start thread")

    def edit_profile(self, profile, author_name):
        """Store ``profile`` for ``author_name`` in memory and return a status message."""
        self.profile[author_name] = profile
        return "Successfully edit profile!"

    def sign_email(self, profile, email):
        """Register ``email`` with ``profile`` and persist the e-mail pool immediately."""
        self.email_pool[email] = profile
        with scheduler.lock:
            with open(self.email_pool_path, "w") as f:
                json.dump(self.email_pool, f)
        return "Successfully sign up!"

    def get_profile(self, author_name):
        """Return the research profile for ``author_name``; ``None`` for an empty name."""
        if author_name == "":
            return None
        return self.get_arxiv_data_by_author(author_name)

    def select_date(self, method, profile_input):
        """Return ``(trend, reference, idea)`` for a profile over a time window.

        Args:
            method: ``"day"``, ``"week"`` or ``"month"`` restricts the papers to
                the last 1 / 7 / 30 days counted back from ``self.newest_day``;
                any other value uses all cached papers.
            profile_input: profile text, also used as the cache key.

        Results are cached in ``self.trend_idea[profile][latest_date][method]``.
        On a cache miss the trend and idea are generated, cached, and their
        first elements plus embeddings are appended to the thought stores.
        """
        today = self.newest_day
        window_days = {"day": 1, "week": 7, "month": 30}.get(method)

        if window_days is None:
            paper_by_date = self.paper
            chunk_embedding_date = self.paper_embedding
        else:
            paper_by_date = {}
            chunk_embedding_date = {}
            for offset in range(window_days):
                offset_day = today - datetime.timedelta(days=offset)
                str_day = offset_day.strftime("%m/%d/%Y")
                if str_day in self.paper:
                    paper_by_date[str_day] = self.paper[str_day]
                    chunk_embedding_date[str_day] = self.paper_embedding[str_day]

        profile = profile_input
        # Cache entries are keyed by the last date key present in the paper store.
        key_update = list(self.paper.keys())[-1]

        cached_by_method = self.trend_idea.get(profile, {}).get(key_update, {})
        if method in cached_by_method:
            cached = cached_by_method[method]
            return cached["trend"], cached["reference"], cached["idea"]

        trend, paper_link = summarize_research_field(
            profile, "Machine Learning", paper_by_date, chunk_embedding_date
        )
        reference = papertitleAndLink(paper_link)
        idea = generate_ideas(trend)

        slot = (
            self.trend_idea
            .setdefault(profile, {})
            .setdefault(key_update, {})
            .setdefault(method, {})
        )
        slot["trend"] = trend
        slot["reference"] = reference
        slot["idea"] = idea

        self.thought.setdefault(key_update, [])
        self.thought_embedding.setdefault(key_update, [])

        self.thought[key_update].append(trend[0])
        self.thought_embedding[key_update].append(get_bert_embedding([trend])[0])
        self.thought[key_update].append(idea[0])
        self.thought_embedding[key_update].append(get_bert_embedding([idea])[0])

        return trend, reference, idea

    def response(self, data, profile_input):
        """Answer ``data`` twice: with papers+thoughts as context and with papers only.

        Returns ``(answer_l, answer_l_org)`` as produced by
        ``get_response_through_LLM_answer`` for the two retrieval contexts.
        """
        query = [data]
        query_embedding = get_bert_embedding(query)
        retrieve_text, retrieve_text_org = self.generate_pair_retrieve_text(query_embedding)
        answer_l = get_response_through_LLM_answer(query, [retrieve_text], profile_input)
        answer_l_org = get_response_through_LLM_answer(query, [retrieve_text_org], profile_input)
        return answer_l, answer_l_org

    def generate_pair_retrieve_text(self, query_embedding):
        """Retrieve context text for a query from two corpora.

        Returns ``(retrieve_text, retrieve_text_org)``: the concatenated
        neighbours found among paper abstracts plus stored thoughts, and among
        paper abstracts only. Each search asks ``neiborhood_search`` for
        ``num=10`` neighbours.
        """
        dataset = self.paper
        thought = self.thought

        # Papers-only corpus.
        text_org_chunk_l = []
        chunks_org_embedding_text_all = []
        for k in dataset:
            text_org_chunk_l.extend(dataset[k]['abstract'])
            chunks_org_embedding_text_all.extend(self.paper_embedding[k])

        # Papers + thoughts corpus; thoughts without stored embeddings are skipped.
        text_chunk_l = list(text_org_chunk_l)
        chunks_embedding_text_all = list(chunks_org_embedding_text_all)
        for k in thought:
            if k in self.thought_embedding:
                text_chunk_l.extend(thought[k])
                chunks_embedding_text_all.extend(self.thought_embedding[k])

        neib_all = neiborhood_search(chunks_embedding_text_all, query_embedding, num=10)
        neib_all = neib_all.reshape(-1)
        retrieve_text = ''.join([text_chunk_l[i] for i in neib_all])

        neib_all = neiborhood_search(chunks_org_embedding_text_all, query_embedding, num=10)
        neib_all = neib_all.reshape(-1)
        retrieve_text_org = ''.join([text_org_chunk_l[i] for i in neib_all])

        return retrieve_text, retrieve_text_org

    def download(self):
        """Fetch the latest papers and merge them into the paper and embedding stores.

        Sets ``self.newest_day``, ``self.paper`` and ``self.paper_embedding``.
        """
        data_collector = []
        keywords = {"Machine Learning": "Machine Learning"}  # topic -> query

        for topic, keyword in keywords.items():
            data, self.newest_day = get_daily_papers(topic, query = keyword, max_results = MAX_DAILY_PAPER)
            data_collector.append(data)

        json_file = self.dataset_path
        try:
            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/paper.json", local_dir = ".", repo_type="dataset")
        except Exception:
            # NOTE(review): mode 'w' also truncates an existing local copy when
            # the download fails for a transient reason -- confirm this is intended.
            with open(json_file, 'w'):
                print(json_file)

        update_file = update_json_file(json_file, data_collector, scheduler)

        try:
            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/paper_embedding.pkl", local_dir = ".", repo_type="dataset")
        except Exception:
            with open(self.embedding_path, 'wb'):
                print(self.embedding_path)

        time_chunks_embed = {}
        for data in data_collector:
            for date in data.keys():
                papers = data[date]['abstract']
                papers_embedding = get_bert_embedding(papers)
                time_chunks_embed[date.strftime("%m/%d/%Y")] = papers_embedding

        update_paper_file = update_pickle_file(self.embedding_path, time_chunks_embed, scheduler)
        self.paper = update_file
        self.paper_embedding = update_paper_file

    def _load_cached_file(self, repo_filename, local_path, decode):
        """Download one state file from the dataset repo and decode its content.

        Returns ``decode(content)`` for a non-empty file and ``{}`` for an empty
        one. If anything fails (download, read or decode) the local file is
        reset to empty and ``{}`` is returned.
        """
        try:
            hf_hub_download(repo_id=DATA_REPO_ID, filename=repo_filename, local_dir = ".", repo_type="dataset")
            with open(local_path, "rb") as f:
                content = f.read()
            return decode(content) if content else {}
        except Exception:
            # NOTE(review): mode 'w' truncates any existing local copy, so a
            # transient download failure resets this store -- confirm intended.
            with open(local_path, mode='w', encoding='utf-8'):
                pass
            return {}

    def load_cache(self):
        """Populate every in-memory store from its file in the dataset repo.

        Each store falls back to an empty dict when its file cannot be loaded.
        """
        self.feedback = self._load_cached_file("dataset/feedback.json", self.feedback_path, json.loads)
        self.trend_idea = self._load_cached_file("dataset/trend_idea.json", self.trend_idea_path, json.loads)
        self.profile = self._load_cached_file("dataset/profile.json", self.profile_path, json.loads)
        self.email_pool = self._load_cached_file("dataset/email.json", self.email_pool_path, json.loads)
        self.thought = self._load_cached_file("dataset/thought.json", self.thought_path, json.loads)
        # SECURITY NOTE: unpickling executes arbitrary code if the file is
        # tampered with; this is only acceptable while the repo is trusted.
        self.thought_embedding = self._load_cached_file(
            "dataset/thought_embedding.pkl", self.thought_embedding_path, pickle.loads
        )
        self.comment = self._load_cached_file("dataset/comment.json", self.comment_path, json.loads)

    def update_feedback_thought(self, query, ansA, ansB, feedbackA, feedbackB):
        """Process answer feedback in a background thread (see ``feedback_thought``)."""
        try:
            thread6.run_threaded(feedback_thought, [self, query, ansA, ansB, feedbackA, feedbackB])
        except Exception:
            print("Error: unable to start thread")

    def update_comment(self, comment):
        """Append ``comment`` to today's in-memory comment list.

        Nothing is written to disk here; ``dailySave`` persists ``self.comment``.
        """
        date = datetime.datetime.now().strftime("%m/%d/%Y")
        self.comment.setdefault(date, []).append(comment)
        return "Thanks for your comment!"

    def get_arxiv_data_by_author(self, author_name):
        """Return a research-direction summary for ``author_name``.

        A cached profile is returned directly. Otherwise the arXiv API is
        queried (up to 300 results), entries whose author list contains
        ``author_name`` exactly are collected, the first 10 are summarised with
        ``summarize_research_direction`` and the result is cached in
        ``self.profile``.

        Returns:
            The summary, or ``None`` when the API does not answer with HTTP 200.

        Raises:
            requests.exceptions.RequestException: on connection errors or when
                the API does not respond within ``ARXIV_TIMEOUT`` seconds.
        """
        if author_name in self.profile:
            return self.profile[author_name]

        atom = "{http://www.w3.org/2005/Atom}"
        author_query = author_name.replace(" ", "+")
        url = f"http://export.arxiv.org/api/query?search_query=au:{author_query}&start=0&max_results=300" # Adjust max_results if needed
        response = requests.get(url, timeout=self.ARXIV_TIMEOUT)
        if response.status_code != 200:
            return None

        root = ElementTree.fromstring(response.content)
        papers_list = []
        papers_by_year = {}
        total_papers = 0

        for entry in root.findall(f"{atom}entry"):
            title = entry.find(f"{atom}title").text.strip()
            published = entry.find(f"{atom}published").text.strip()
            abstract = entry.find(f"{atom}summary").text.strip()
            authors = [
                author.find(f"{atom}name").text
                for author in entry.findall(f"{atom}author")
            ]
            link = entry.find(f"{atom}id").text.strip()

            # Keep only entries listing exactly this author name.
            if author_name not in authors:
                continue

            coauthors = [author for author in authors if author != author_name]
            papers_list.append({
                "date": published,
                "Title & Abstract": f"{title}; {abstract}",
                "coauthors": ", ".join(coauthors),
                "link": link,
            })

            total_papers += 1
            year = datetime.datetime.strptime(published, '%Y-%m-%dT%H:%M:%SZ').year
            papers_by_year.setdefault(year, []).append(entry)

        if total_papers > 40:
            # Sample up to two papers per year, grouped in 5-year cycles.
            # NOTE(review): these entries are appended after more than 40
            # existing ones and the list is cut to its first 10 just below, so
            # the sample never reaches the summary -- confirm the intent.
            for cycle_start in range(min(papers_by_year), max(papers_by_year) + 1, 5):
                cycle_end = cycle_start + 4
                for year in range(cycle_start, cycle_end + 1):
                    for paper in papers_by_year.get(year, [])[:2]:
                        title = paper.find(f"{atom}title").text.strip()
                        abstract = paper.find(f"{atom}summary").text.strip()
                        co_authors = [
                            author.find(f"{atom}name").text
                            for author in paper.findall(f"{atom}author")
                            if author.find(f"{atom}name").text != author_name
                        ]
                        papers_list.append({
                            "Author": author_name,
                            "Title & Abstract": f"{title}; {abstract}",
                            "Date Period": f"{year}",
                            "Cycle": f"{cycle_start}-{cycle_end}",
                            "Co_author": ", ".join(co_authors),
                        })

        # Keep only the first 10 collected entries (API response order).
        papers_list = papers_list[:10]

        personal_info = "; ".join(details['Title & Abstract'] for details in papers_list)
        info = summarize_research_direction(personal_info)
        self.profile[author_name] = info
        return self.profile[author_name]