Spaces:
Running
Running
alvinhenrick
committed on
Commit
•
a11089e
1
Parent(s):
3756ff7
clean up
Browse files
- app.py +1 -1
- medirag/core/reader.py +19 -10
app.py
CHANGED
@@ -21,7 +21,7 @@ dspy.settings.configure(lm=turbo, rm=rm)
|
|
21 |
|
22 |
rag = RAG(k=5)
|
23 |
sm = SemanticCaching(model_name='sentence-transformers/all-mpnet-base-v2', dimension=768,
|
24 |
-
json_file='rag_test_cache.json', cosine_threshold=.
|
25 |
sm.load_cache()
|
26 |
|
27 |
|
|
|
21 |
|
22 |
rag = RAG(k=5)
|
23 |
sm = SemanticCaching(model_name='sentence-transformers/all-mpnet-base-v2', dimension=768,
|
24 |
+
json_file='rag_test_cache.json', cosine_threshold=.90, rag=rag)
|
25 |
sm.load_cache()
|
26 |
|
27 |
|
medirag/core/reader.py
CHANGED
@@ -1,11 +1,20 @@
|
|
1 |
import zipfile
|
2 |
-
|
3 |
from bs4 import BeautifulSoup
|
4 |
from llama_index.core import Document
|
5 |
from llama_index.core.readers.base import BaseReader
|
6 |
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
def format_output_string(drug_name, sections_data):
|
|
|
9 |
output = [f"Drug Name: {drug_name}"]
|
10 |
|
11 |
for title, paragraphs in sections_data.items():
|
@@ -27,7 +36,10 @@ def parse_drug_information(soup, extra_info=None):
|
|
27 |
if not set_id:
|
28 |
return None
|
29 |
|
|
|
30 |
structured_body = soup.find("structuredBody")
|
|
|
|
|
31 |
|
32 |
# Extract the drug name
|
33 |
drug_name = None
|
@@ -50,27 +62,24 @@ def parse_drug_information(soup, extra_info=None):
|
|
50 |
sections = component.find_all("section")
|
51 |
for section in sections:
|
52 |
title_tag = section.find("title")
|
53 |
-
if title_tag
|
54 |
-
|
55 |
-
else:
|
56 |
continue # Skip if title is not found
|
57 |
|
58 |
paragraphs = section.find_all("paragraph")
|
59 |
paragraphs_text = []
|
60 |
seen_paragraphs = set() # Set to track unique paragraphs
|
|
|
61 |
for paragraph in paragraphs:
|
62 |
-
paragraph_text = paragraph.get_text(strip=True)
|
63 |
-
if paragraph_text and paragraph_text
|
64 |
paragraphs_text.append(paragraph_text)
|
65 |
seen_paragraphs.add(paragraph_text)
|
66 |
|
67 |
# Only include sections with non-empty, non-duplicate paragraphs
|
68 |
if paragraphs_text:
|
69 |
if title_text in sections_data:
|
70 |
-
|
71 |
-
# Add only unique paragraphs that aren't already in the title's list
|
72 |
-
unique_paragraphs = [p for p in paragraphs_text if p not in existing_paragraphs]
|
73 |
-
sections_data[title_text].extend(unique_paragraphs)
|
74 |
else:
|
75 |
sections_data[title_text] = paragraphs_text
|
76 |
|
|
|
1 |
import zipfile
|
2 |
+
import re
|
3 |
from bs4 import BeautifulSoup
|
4 |
from llama_index.core import Document
|
5 |
from llama_index.core.readers.base import BaseReader
|
6 |
|
7 |
|
8 |
+
def normalize_text(text):
    """Normalize text: lowercase, strip punctuation, and collapse whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with every character that is neither a word
        character nor whitespace removed, runs of whitespace collapsed to a
        single space, and leading/trailing whitespace stripped.
    """
    text = text.lower()
    # Strip punctuation BEFORE collapsing whitespace: removing a standalone
    # punctuation token (e.g. the dash in "a - b") would otherwise leave two
    # adjacent spaces behind in the result.
    # NOTE(review): this also removes decimal points and slashes, so "0.5 mg/ml"
    # becomes "05 mgml" -- confirm that is acceptable for the text being indexed.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse spaces/newlines/tabs to one space
    return text.strip()
|
14 |
+
|
15 |
+
|
16 |
def format_output_string(drug_name, sections_data):
|
17 |
+
"""Format the output string for document embedding."""
|
18 |
output = [f"Drug Name: {drug_name}"]
|
19 |
|
20 |
for title, paragraphs in sections_data.items():
|
|
|
36 |
if not set_id:
|
37 |
return None
|
38 |
|
39 |
+
# Ensure structured body exists
|
40 |
structured_body = soup.find("structuredBody")
|
41 |
+
if not structured_body:
|
42 |
+
return None
|
43 |
|
44 |
# Extract the drug name
|
45 |
drug_name = None
|
|
|
62 |
sections = component.find_all("section")
|
63 |
for section in sections:
|
64 |
title_tag = section.find("title")
|
65 |
+
title_text = normalize_text(title_tag.get_text(strip=True)) if title_tag else None
|
66 |
+
if not title_text:
|
|
|
67 |
continue # Skip if title is not found
|
68 |
|
69 |
paragraphs = section.find_all("paragraph")
|
70 |
paragraphs_text = []
|
71 |
seen_paragraphs = set() # Set to track unique paragraphs
|
72 |
+
|
73 |
for paragraph in paragraphs:
|
74 |
+
paragraph_text = normalize_text(paragraph.get_text(strip=True))
|
75 |
+
if paragraph_text and paragraph_text not in seen_paragraphs:
|
76 |
paragraphs_text.append(paragraph_text)
|
77 |
seen_paragraphs.add(paragraph_text)
|
78 |
|
79 |
# Only include sections with non-empty, non-duplicate paragraphs
|
80 |
if paragraphs_text:
|
81 |
if title_text in sections_data:
|
82 |
+
sections_data[title_text].extend(paragraphs_text)
|
|
|
|
|
|
|
83 |
else:
|
84 |
sections_data[title_text] = paragraphs_text
|
85 |
|