alvinhenrick committed on
Commit
a11089e
1 Parent(s): 3756ff7
Files changed (2) hide show
  1. app.py +1 -1
  2. medirag/core/reader.py +19 -10
app.py CHANGED
@@ -21,7 +21,7 @@ dspy.settings.configure(lm=turbo, rm=rm)
21
 
22
  rag = RAG(k=5)
23
  sm = SemanticCaching(model_name='sentence-transformers/all-mpnet-base-v2', dimension=768,
24
- json_file='rag_test_cache.json', cosine_threshold=.85, rag=rag)
25
  sm.load_cache()
26
 
27
 
 
21
 
22
  rag = RAG(k=5)
23
  sm = SemanticCaching(model_name='sentence-transformers/all-mpnet-base-v2', dimension=768,
24
+ json_file='rag_test_cache.json', cosine_threshold=.90, rag=rag)
25
  sm.load_cache()
26
 
27
 
medirag/core/reader.py CHANGED
@@ -1,11 +1,20 @@
1
  import zipfile
2
-
3
  from bs4 import BeautifulSoup
4
  from llama_index.core import Document
5
  from llama_index.core.readers.base import BaseReader
6
 
7
 
 
 
 
 
 
 
 
 
8
  def format_output_string(drug_name, sections_data):
 
9
  output = [f"Drug Name: {drug_name}"]
10
 
11
  for title, paragraphs in sections_data.items():
@@ -27,7 +36,10 @@ def parse_drug_information(soup, extra_info=None):
27
  if not set_id:
28
  return None
29
 
 
30
  structured_body = soup.find("structuredBody")
 
 
31
 
32
  # Extract the drug name
33
  drug_name = None
@@ -50,27 +62,24 @@ def parse_drug_information(soup, extra_info=None):
50
  sections = component.find_all("section")
51
  for section in sections:
52
  title_tag = section.find("title")
53
- if title_tag:
54
- title_text = title_tag.get_text(strip=True)
55
- else:
56
  continue # Skip if title is not found
57
 
58
  paragraphs = section.find_all("paragraph")
59
  paragraphs_text = []
60
  seen_paragraphs = set() # Set to track unique paragraphs
 
61
  for paragraph in paragraphs:
62
- paragraph_text = paragraph.get_text(strip=True)
63
- if paragraph_text and paragraph_text.strip() and paragraph_text not in seen_paragraphs:
64
  paragraphs_text.append(paragraph_text)
65
  seen_paragraphs.add(paragraph_text)
66
 
67
  # Only include sections with non-empty, non-duplicate paragraphs
68
  if paragraphs_text:
69
  if title_text in sections_data:
70
- existing_paragraphs = set(sections_data[title_text])
71
- # Add only unique paragraphs that aren't already in the title's list
72
- unique_paragraphs = [p for p in paragraphs_text if p not in existing_paragraphs]
73
- sections_data[title_text].extend(unique_paragraphs)
74
  else:
75
  sections_data[title_text] = paragraphs_text
76
 
 
1
  import zipfile
2
+ import re
3
  from bs4 import BeautifulSoup
4
  from llama_index.core import Document
5
  from llama_index.core.readers.base import BaseReader
6
 
7
 
8
def normalize_text(text):
    """Normalize text: lowercase, drop punctuation, collapse whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with every character that is neither a word
        character nor whitespace removed, each run of whitespace collapsed
        to a single space, and leading/trailing whitespace stripped.
    """
    text = text.lower()
    # Strip punctuation *before* collapsing whitespace: removing a
    # free-standing symbol (e.g. the dash in "a - b") leaves two adjacent
    # spaces, which the whitespace pass below must still merge into one.
    # NOTE(review): this also removes decimal points ("0.5" -> "05");
    # confirm that is acceptable for the text passed in by callers.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
14
+
15
+
16
  def format_output_string(drug_name, sections_data):
17
+ """Format the output string for document embedding."""
18
  output = [f"Drug Name: {drug_name}"]
19
 
20
  for title, paragraphs in sections_data.items():
 
36
  if not set_id:
37
  return None
38
 
39
+ # Ensure structured body exists
40
  structured_body = soup.find("structuredBody")
41
+ if not structured_body:
42
+ return None
43
 
44
  # Extract the drug name
45
  drug_name = None
 
62
  sections = component.find_all("section")
63
  for section in sections:
64
  title_tag = section.find("title")
65
+ title_text = normalize_text(title_tag.get_text(strip=True)) if title_tag else None
66
+ if not title_text:
 
67
  continue # Skip if title is not found
68
 
69
  paragraphs = section.find_all("paragraph")
70
  paragraphs_text = []
71
  seen_paragraphs = set() # Set to track unique paragraphs
72
+
73
  for paragraph in paragraphs:
74
+ paragraph_text = normalize_text(paragraph.get_text(strip=True))
75
+ if paragraph_text and paragraph_text not in seen_paragraphs:
76
  paragraphs_text.append(paragraph_text)
77
  seen_paragraphs.add(paragraph_text)
78
 
79
  # Only include sections with non-empty, non-duplicate paragraphs
80
  if paragraphs_text:
81
  if title_text in sections_data:
82
+ sections_data[title_text].extend(paragraphs_text)
 
 
 
83
  else:
84
  sections_data[title_text] = paragraphs_text
85