Spaces:

as-cle-bert
/

BioMedicalPapersBot

Running

App Files Files Community

BioMedicalPapersBot / pubmedScraper.py

as-cle-bert

Update pubmedScraper.py

234c263 verified 20 days ago

raw

history blame

3.23 kB

	from Bio import Entrez
	import xml.etree.ElementTree as ET

	def remove_blankets(ls):
	    """Remove blank entries from ``ls`` in place.

	    An entry is considered blank when it equals ``""`` or ``" "``.
	    The list object passed in is mutated; nothing is returned.

	    Args:
	        ls: Mutable list to clean up.
	    """
	    # Slice assignment keeps the caller's list object while rebuilding its
	    # contents in one pass. Removing items while indexing over the same list
	    # skips the element that follows each removal, so runs of consecutive
	    # blanks would otherwise be left behind.
	    ls[:] = [item for item in ls if item not in ("", " ")]

	def search_pubmed(query, max_results, address):
	    """Search PubMed and return the IDs of the matching records.

	    Args:
	        query: Search term passed to ``Entrez.esearch`` as ``term``.
	        max_results: Upper bound on returned IDs (passed as ``retmax``).
	        address: E-mail address assigned to ``Entrez.email``.

	    Returns:
	        The ``"IdList"`` entry of the parsed search response.
	    """
	    # Entrez.email is module-level state, so it also applies to later calls.
	    Entrez.email = address
	    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
	    try:
	        record = Entrez.read(handle)
	    finally:
	        # Release the connection even when parsing the response fails.
	        handle.close()
	    return record["IdList"]

	def fetch_pubmed_details(pubmed_ids, address):
	    """Download the records for ``pubmed_ids`` and store them as an XML file.

	    The response is written to ``articles.xml`` in the current working
	    directory, replacing any file of that name that already exists.

	    Args:
	        pubmed_ids: PubMed IDs passed to ``Entrez.efetch`` as ``id``.
	        address: E-mail address assigned to ``Entrez.email``.

	    Returns:
	        The string ``"articles.xml"``, i.e. the path of the written file.
	    """
	    # Entrez.email is module-level state, so it also applies to later calls.
	    Entrez.email = address
	    handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="xml")
	    try:
	        records = handle.read()
	    finally:
	        # Release the connection even when reading the response fails.
	        handle.close()
	    # Tolerate a handle that already yields text so that a str payload does
	    # not fail on a missing .decode attribute.
	    if isinstance(records, bytes):
	        records = records.decode("utf-8")
	    # Write with an explicit encoding: the platform default may not be UTF-8,
	    # which would corrupt or reject non-ASCII characters in the records.
	    with open("articles.xml", "w", encoding="utf-8") as xml_file:
	        xml_file.write(records)
	    return "articles.xml"

	def fetch_xml(xml_file):
	    """Parse a PubMed XML export into a dictionary keyed by DOI.

	    Args:
	        xml_file: File name or file object handed to ``ET.parse``.

	    Returns:
	        A dict mapping each article's DOI to a dict with the keys
	        ``"Title"`` (str), ``"Authors"`` (list of str) and ``"PubDate"``
	        (str). An article without a DOI is stored under the key ``None``,
	        so when several articles lack a DOI only the last one is kept.
	    """
	    root = ET.parse(xml_file).getroot()

	    articles = {}

	    for article in root.findall('PubmedArticle'):
	        # itertext() also collects text nested inside inline markup such as
	        # <i> or <sub>; reading .text alone stops at the first child element.
	        title_elem = article.find('.//ArticleTitle')
	        title = "".join(title_elem.itertext()).strip() if title_elem is not None else ""
	        if not title:
	            title = "No title"

	        authors = []
	        for author in article.findall('.//Author'):
	            # findtext() returns "" for a missing *or* empty element, so an
	            # empty <ForeName/> can no longer leak the string "None".
	            last_name = author.findtext('LastName', default="")
	            fore_name = author.findtext('ForeName', default="")
	            name = f"{fore_name} {last_name}".strip()
	            if not name:
	                # Group authors carry a CollectiveName instead of personal names.
	                name = author.findtext('CollectiveName', default="").strip()
	            authors.append(name)

	        # The first ELocationID flagged as a DOI wins.
	        doi = None
	        for elocation_id in article.findall('.//ELocationID'):
	            if elocation_id.get('EIdType') == 'doi':
	                doi = elocation_id.text
	                break

	        publication_date = "No publication date"
	        pub_date = article.find('.//PubDate')
	        if pub_date is not None:
	            parts = [
	                pub_date.findtext(tag, default="").strip()
	                for tag in ('Year', 'Month', 'Day')
	            ]
	            # Join only the parts that are present to avoid doubled dashes.
	            date_text = "-".join(part for part in parts if part)
	            if not date_text:
	                # Some records give a free-form MedlineDate instead.
	                date_text = pub_date.findtext('MedlineDate', default="").strip()
	            if date_text:
	                publication_date = date_text

	        articles[doi] = {"Title": title, "Authors": authors, "PubDate": publication_date}
	    return articles

	def respond_to_query(query,address,max_results=10):
	    """Search PubMed for ``query`` and format the hits as Markdown text.

	    Runs ``search_pubmed``, ``fetch_pubmed_details`` and ``fetch_xml`` in
	    sequence and renders one section per returned article: title,
	    publication date, a collapsible author list and a DOI link.

	    Args:
	        query: Search term forwarded to ``search_pubmed``.
	        address: E-mail address forwarded to the Entrez helpers.
	        max_results: Maximum number of articles to request (default 10).

	    Returns:
	        The concatenated Markdown sections, or ``""`` when the search
	        yields no IDs.
	    """
	    pubmed_ids = search_pubmed(query, max_results, address)
	    if not pubmed_ids:
	        # Nothing matched: skip the fetch, which has no IDs to request.
	        return ""
	    pubmed_details = fetch_pubmed_details(pubmed_ids, address)
	    articles = fetch_xml(pubmed_details)

	    sections = []
	    for doi, info in articles.items():
	        if info["Authors"]:
	            auths = [f"- <kbd> {author} </kbd>" for author in info["Authors"]]
	        else:
	            auths = ["- <kbd> No authors listed </kbd>", ""]
	        authorrs = '\n'.join(auths)
	        # fetch_xml stores articles without a DOI under the key None; do not
	        # render a dead "https://doi.org/None" link for those.
	        if doi:
	            doi_line = f"DOI: [{doi}🔗](https://doi.org/{doi}) "
	        else:
	            doi_line = "DOI: not available "
	        sections.append(
	            f"Title: {info['Title']}\nPublication date: {info['PubDate']}\n<details>\n\t<summary><b>Authors</b></summary>\n\n{authorrs}\n\n</details>\n\n{doi_line}\n\n-----------------------\n"
	        )
	    # Join once instead of growing the string with += on every iteration.
	    return "".join(sections)