as-cle-bert committed on
Commit
234c263
1 Parent(s): 07ce08d

Update pubmedScraper.py

Browse files
Files changed (1) hide show
  1. pubmedScraper.py +50 -26
pubmedScraper.py CHANGED
@@ -1,4 +1,5 @@
1
  from Bio import Entrez
 
2
 
3
  def remove_blankets(ls):
4
  for i in range(len(ls)):
@@ -19,36 +20,59 @@ def search_pubmed(query, max_results, address):
19
 
20
  def fetch_pubmed_details(pubmed_ids, address):
21
  Entrez.email = address # Replace with your email
22
- handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="text")
23
  records = handle.read()
24
  handle.close()
25
- return records
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  def respond_to_query(query,address,max_results=10):
28
- # Perform the PubMed search
29
  pubmed_ids = search_pubmed(query, max_results,address)
30
-
31
- # Fetch details for the retrieved PubMed IDs
32
  pubmed_details = fetch_pubmed_details(pubmed_ids,address)
 
 
 
 
 
 
 
 
33
 
34
- pubmed_split=pubmed_details.split("\n")
35
- str_container=[]
36
- counter=-1
37
- for i in pubmed_split:
38
- str_container.append({})
39
- counter+=1
40
- if i.startswith("TI"):
41
- str_container[counter].update({"Title (sometimes not complete)": i.replace('TI - ', '')})
42
- if i.startswith("AU - "):
43
- str_container[counter].update({"Author": i.replace('AU - ', '')})
44
- if i.startswith("PHST") and i.endswith("[pubmed]"):
45
- str_container[counter].update({"Published on PubMed on": i.replace('PHST- ', '').replace('[pubmed]','')})
46
- if i.endswith("[doi]") and i.startswith("AID - "):
47
- str_container[counter].update({"doi": f"https://doi.org/{i[6:len(i)-5]}\n"})
48
- results=[]
49
- for j in str_container:
50
- ls=[f"{key}: {j[key]}\n" for key in list(j.keys())]
51
- results.append("".join(ls))
52
- remove_blankets(results)
53
- defstr="".join(results)
54
- return defstr
 
1
  from Bio import Entrez
2
+ import xml.etree.ElementTree as ET
3
 
4
  def remove_blankets(ls):
5
  for i in range(len(ls)):
 
20
 
21
def fetch_pubmed_details(pubmed_ids, address, xml_path="articles.xml"):
    """Download PubMed records as XML and save them to a file.

    Args:
        pubmed_ids: PubMed IDs to fetch (as accepted by ``Entrez.efetch``).
        address: Contact e-mail address passed to NCBI via ``Entrez.email``.
        xml_path: Destination file for the downloaded XML. Defaults to
            ``"articles.xml"``, the path the function has always used.

    Returns:
        The path of the XML file that was written (``xml_path``).
    """
    if not pubmed_ids:
        # Nothing to fetch: skip the Entrez call entirely (an efetch request
        # with an empty id list cannot return any records) and write an empty
        # article set so the returned file is still well-formed XML.
        with open(xml_path, "wb") as xml_file:
            xml_file.write(b"<PubmedArticleSet></PubmedArticleSet>")
        return xml_path

    Entrez.email = address
    handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="xml")
    try:
        records = handle.read()
    finally:
        # Release the connection even when reading fails.
        handle.close()

    # Depending on the Biopython version the payload is bytes or str;
    # normalise to UTF-8 bytes so the file content never depends on the
    # platform's default text encoding.
    if isinstance(records, str):
        records = records.encode("utf-8")
    with open(xml_path, "wb") as xml_file:
        xml_file.write(records)
    return xml_path
31
+
32
def fetch_xml(xml_file):
    """Parse a PubMed XML export into a dictionary of article metadata.

    Args:
        xml_file: Path (or file object) of a PubMed ``PubmedArticleSet`` XML
            document, as accepted by ``xml.etree.ElementTree.parse``.

    Returns:
        A dict with one entry per ``PubmedArticle``. The key is the article's
        DOI when one is present; otherwise a unique fallback key
        (``"PMID:<id>"``, or ``"article-<n>"`` if no PMID is found) is used so
        that articles without a DOI do not overwrite each other. Each value is
        ``{"Title": str, "Authors": list[str], "PubDate": str}``.
    """
    root = ET.parse(xml_file).getroot()

    articles = {}

    for index, article in enumerate(root.findall('PubmedArticle')):
        # Titles may contain inline markup (<i>, <sub>, ...); itertext()
        # collects the full text instead of only the part before the first tag.
        title_node = article.find('.//ArticleTitle')
        title = "".join(title_node.itertext()).strip() if title_node is not None else ""
        if not title:
            title = "No title"

        authors = []
        for author in article.findall('.//Author'):
            last_name = (author.findtext('LastName') or "").strip()
            fore_name = (author.findtext('ForeName') or "").strip()
            name = f"{fore_name} {last_name}".strip()
            if not name:
                # Group authors carry a CollectiveName instead of personal names.
                name = (author.findtext('CollectiveName') or "").strip()
            if name:
                authors.append(name)

        doi = None
        for elocation_id in article.findall('.//ELocationID'):
            if elocation_id.get('EIdType') == 'doi' and elocation_id.text:
                doi = elocation_id.text.strip()
                break
        if doi is None:
            # Fall back to the article's own id list. The explicit path skips
            # the ArticleIdList elements nested inside the reference list.
            for article_id in article.findall('PubmedData/ArticleIdList/ArticleId'):
                if article_id.get('IdType') == 'doi' and article_id.text:
                    doi = article_id.text.strip()
                    break

        publication_date = ""
        pub_date = article.find('.//PubDate')
        if pub_date is not None:
            parts = [(pub_date.findtext(tag) or "").strip() for tag in ('Year', 'Month', 'Day')]
            publication_date = "-".join(part for part in parts if part)
            if not publication_date:
                # Some records only provide a free-form MedlineDate.
                publication_date = (pub_date.findtext('MedlineDate') or "").strip()
        if not publication_date:
            publication_date = "No publication date"

        key = doi
        if key is None:
            pmid = (article.findtext('MedlineCitation/PMID') or "").strip()
            key = f"PMID:{pmid}" if pmid else f"article-{index + 1}"

        articles[key] = {"Title": title, "Authors": authors, "PubDate": publication_date}
    return articles
66
 
67
def respond_to_query(query, address, max_results=10):
    """Search PubMed and format the matching articles as Markdown.

    Args:
        query: Search expression forwarded to ``search_pubmed``.
        address: Contact e-mail address forwarded to the Entrez helpers.
        max_results: Maximum number of articles to retrieve.

    Returns:
        A Markdown string with one section per article (title, publication
        date, collapsible author list and DOI), or an empty string when no
        article was found.
    """
    pubmed_ids = search_pubmed(query, max_results, address)
    pubmed_details = fetch_pubmed_details(pubmed_ids, address)
    articles = fetch_xml(pubmed_details)

    sections = []
    for doi, info in articles.items():
        if info["Authors"]:
            author_lines = '\n'.join(f"- <kbd> {author} </kbd>" for author in info["Authors"])
        else:
            author_lines = "- <kbd> No authors listed </kbd>\n"

        # Only render a link when the key really is a DOI (every DOI starts
        # with the "10." directory indicator); a missing DOI would otherwise
        # produce a dead https://doi.org/None link.
        if isinstance(doi, str) and doi.startswith("10."):
            doi_text = f"[{doi}🔗](https://doi.org/{doi})"
        else:
            doi_text = "Not available"

        sections.append(
            f"**Title**: {info['Title']}\n"
            f"**Publication date**: {info['PubDate']}\n"
            f"<details>\n\t<summary><b>Authors</b></summary>\n\n{author_lines}\n\n</details>\n\n"
            f"**DOI**: {doi_text} \n\n-----------------------\n"
        )
    # Join once instead of growing the string with repeated +=.
    return "".join(sections)
78