Spaces:
Runtime error
Runtime error
2001muhammadumair
commited on
Commit
•
2da8c28
1
Parent(s):
8e2a8b6
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import faiss
import numpy as np
import fitz  # PyMuPDF for PDF processing
from sentence_transformers import SentenceTransformer
from groq import Groq
import gradio as gr
import logging
import pickle

# Initialize logging to track events and errors.
logging.basicConfig(filename='query_logs.log', level=logging.INFO)

# Securely load the GROQ API key from the environment instead of hard-coding
# it in source control. (The previously committed literal key must be treated
# as compromised and rotated on the Groq dashboard.)
grog_api_key = os.environ.get("GROQ_API_KEY")
if not grog_api_key:
    raise ValueError("GROQ_API_KEY environment variable not set.")
client = Groq(api_key=grog_api_key)

# Path to the PDF file containing pharmaceutical content.
# NOTE(review): this is a Colab-style absolute path — confirm it exists in the
# Space's runtime filesystem.
book_path = '/content/martins-physical-pharmacy-6th-ed-2011-dr-murtadha-alshareifi.pdf'
# Function to read and extract text from the PDF
def read_pdf(file_path):
    """Extract the text of every page of a PDF.

    Parameters:
        file_path: path to the PDF file on disk.

    Returns:
        A list with one string per page (in page order), or an empty list
        if the file cannot be opened or parsed; the error is logged.
    """
    try:
        doc = fitz.open(file_path)
        try:
            # One plain-text entry per page.
            return [doc.load_page(page_num).get_text("text")
                    for page_num in range(doc.page_count)]
        finally:
            # Always release the underlying file handle — the original
            # implementation leaked the Document on every call.
            doc.close()
    except Exception as e:
        logging.error(f"Error reading PDF: {str(e)}")
        return []
# Function to split text into paragraphs
def split_text_into_paragraphs(text_pages, max_tokens=300):
    """Greedily pack paragraphs into chunks of at most ~max_tokens characters.

    Note: despite the parameter name, "tokens" are approximated by character
    count (`len`), not tokenizer tokens.

    Parameters:
        text_pages: list of page strings (as produced by read_pdf).
        max_tokens: soft character budget per chunk; a single paragraph
            longer than this still becomes its own chunk.

    Returns:
        List of non-empty, stripped chunk strings.
    """
    chunks = []
    for page in text_pages:
        paragraphs = page.split('\n\n')
        chunk = ""
        for para in paragraphs:
            if len(chunk) + len(para) <= max_tokens:
                chunk += para + "\n"
            else:
                # Flush the current chunk before starting a new one, but never
                # emit an empty chunk (the original appended "" whenever the
                # very first paragraph of a page exceeded the budget).
                if chunk.strip():
                    chunks.append(chunk.strip())
                chunk = para + "\n"
        if chunk.strip():
            chunks.append(chunk.strip())
    return chunks
# Function to vectorize text chunks and create a FAISS index
def vectorize_text(chunks, batch_size=100, save_path="embeddings.pkl"):
    """Embed text chunks with all-MiniLM-L6-v2 and build a FAISS L2 index.

    Parameters:
        chunks: list of text chunks to embed.
        batch_size: number of chunks encoded per model.encode call.
        save_path: pickle file used as an on-disk cache for the index.

    Returns:
        (index, chunks) on success, or (None, None) on failure (logged).
    """
    # Reuse a previously built index when available.
    # NOTE(review): pickle.load executes code from the file — only safe on a
    # cache this process wrote itself; never load an untrusted pickle.
    if os.path.exists(save_path):
        try:
            with open(save_path, "rb") as f:
                index = pickle.load(f)
            # NOTE(review): the cached index is not validated against the
            # current `chunks`; a stale cache may mismatch — confirm.
            return index, chunks
        except Exception as e:
            # A corrupt/stale cache must not be fatal at import time:
            # log it and fall through to rebuild from scratch.
            logging.error(f"Error loading cached embeddings: {str(e)}")
    try:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = []
        # 384 is the output dimension of all-MiniLM-L6-v2.
        index = faiss.IndexFlatL2(384)

        for i in range(0, len(chunks), batch_size):
            chunk_batch = chunks[i:i + batch_size]
            batch_embeddings = model.encode(chunk_batch, show_progress_bar=True)
            embeddings.append(batch_embeddings)
            index.add(np.array(batch_embeddings))

        with open(save_path, "wb") as f:
            pickle.dump(index, f)
        return index, chunks
    except Exception as e:
        logging.error(f"Error during vectorization: {str(e)}")
        return None, None
# Load and vectorize PDF content at import time so the index is ready before
# the Gradio app starts serving queries.
text_pages = read_pdf(book_path)
if not text_pages:
    # read_pdf already logged the underlying exception; fail fast here.
    raise RuntimeError("Failed to read PDF content. Check logs for details.")

chunks = split_text_into_paragraphs(text_pages)
# NOTE(review): vectorize_text may return a cached index built from an earlier
# run whose size does not match the freshly computed `chunks` — confirm the
# embeddings.pkl cache corresponds to this PDF.
vector_index, chunks = vectorize_text(chunks)
if vector_index is None or chunks is None:
    raise RuntimeError("Vectorization failed. Check logs for details.")
# Function to generate query embeddings
def generate_query_embedding(query, model):
    """Encode one query string into its embedding with the given model.

    Returns whatever model.encode yields for a single-element batch.
    """
    single_item_batch = [query]
    return model.encode(single_item_batch)
# Function to check relevancy based on distance threshold
def check_relevancy(distances, threshold=1):
    """Report whether the single nearest neighbour is close enough.

    `distances` is the FAISS-style 2-D result matrix; only the distance of
    the best match for the first query row is consulted.
    """
    nearest = distances[0][0]
    return nearest <= threshold
# System prompt defining the chatbot's attributes and response structure.
# Prepended verbatim to every Groq completion request in generate_answer.
system_prompt = """
You are **PharmaExpert Pro**, an advanced chatbot specialized in the field of pharmaceutical sciences. Your responses should be structured, concise, and informative, making complex topics accessible.

# Response Structure:
1. **Overview**: Start with a brief context to set the user’s expectations.
2. **Definition**: Clearly define the concept being queried.
3. **In-Depth Analysis**: Provide a detailed breakdown of concepts, including:
- Examples
- Relevant formulas (if applicable)
- Learning processes
- Working mechanisms
- Purpose
- Advantages and disadvantages
- Role in the broader topic
4. **Summary**: Conclude with a short summary of essential takeaways, ensuring clarity and retention.

# Communication Style:
- **Professional yet Accessible**: Keep language rigorous yet clear.
- **Concise and Informative**: Avoid excess details while covering the core information.
- **Encouraging Exploration**: Foster an environment for follow-up questions.

# Unique Qualities:
1. **Source-Specific Expertise**: Refer only to the provided PDF.
2. **Educational Tools**: Use summaries and key points.
3. **Adaptability**: Adjust responses based on the user’s expertise level.
"""
# Function to generate a single, comprehensive answer

# Lazily constructed, shared embedding model. The original rebuilt
# SentenceTransformer('all-MiniLM-L6-v2') on every query, reloading the model
# weights each call; caching it makes latency dominated by the LLM call.
_query_encoder = None


def _get_query_encoder():
    """Return the shared sentence-embedding model, creating it on first use."""
    global _query_encoder
    if _query_encoder is None:
        _query_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return _query_encoder


def generate_answer(query):
    """Answer `query` from the indexed PDF, or fall back to a general answer.

    Retrieves the 5 nearest chunks from the module-level FAISS index; if the
    best match is within the relevancy threshold, asks the Groq LLM with the
    retrieved context, otherwise asks for a general (context-free) answer.

    Returns:
        The LLM's answer as a stripped string.
    """
    model = _get_query_encoder()
    query_embedding = generate_query_embedding(query, model)
    # D: L2 distances, I: indices of the 5 nearest chunks for the query row.
    D, I = vector_index.search(np.array(query_embedding), k=5)

    if check_relevancy(D):
        relevant_chunks = [chunks[i] for i in I[0]]
        combined_text = " ".join(relevant_chunks)

        user_prompt = f"The user has inquired about a complex pharmaceutical topic. Query: {query}"

        assistant_prompt = f"""
Using the following context from the pharmacy PDF, respond with structured detail. **Avoid external citations in your answer.**

**Context:**
{combined_text}

**User's question:**
{query}

**Response Structure:**

- **Concept Overview**
- **Contextual Relevance**
- **Overview of the Concept**
- **Definition**
- **Foundations**
- **Examples** (including relevant case studies)
- **Formulas** (if available)
- **Key Terms and Definitions**
- **Key Vocabulary**
- **Historical Context**
- **Applications and Practical Uses**
- **Step-by-Step Explanation** of processes or calculations
- **Visual Aids** (suggestions for diagrams or graphs)
- **Visual Aids Explanation**
- **Purpose and Significance**
- **Common Misconceptions**
- **Key Challenges and Controversies** in the field
- **Practical Exercises**
- **Comparative Analysis**
- **Future Implications**
- **Future Directions** or potential advancements
- **Cultural Context**
- **Fun Activities**
- **Quiz Questions** 7 quiz
- **Step-by-Step Guide**
- **Interactive Elements**
- **Summative Table** for quick reference
- **Summative Review**
- **Final Summary**
- **Summary**
"""

        prompt = system_prompt + "\n\n" + user_prompt + "\n\n" + assistant_prompt

        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192",
            temperature=0.7,
            top_p=0.9,
        )
        answer = response.choices[0].message.content.strip()
        return answer
    else:
        # Out-of-scope query: answer generically. Bug fix — the original
        # fallback prompt never included the user's question, so the model
        # was asked to answer a question it was never shown.
        fallback_prompt = (
            "The user's question is outside the scope of the PDF content. "
            "Provide a general answer without referencing external sources.\n\n"
            f"User's question: {query}"
        )
        fallback_response = client.chat.completions.create(
            messages=[{"role": "user", "content": fallback_prompt}],
            model="llama3-8b-8192",
            temperature=0.7,
            top_p=0.9
        )
        return fallback_response.choices[0].message.content.strip()
# Gradio app interface function
def gradio_interface(user_query):
    """Single-turn entry point: greet on blank input, otherwise answer.

    A query that is empty or whitespace-only gets the welcome banner;
    anything else is delegated to generate_answer.
    """
    if not user_query.strip():
        return ("Welcome to **Physical Pharmacy Book**! "
                "Ask me anything related to pharmaceutical sciences.")
    return generate_answer(user_query)
# Gradio interface setup.
# The inline CSS hides Gradio's default footer on the page.
with gr.Blocks(css=".footer {display: none;}") as iface:
    # Static page header rendered as HTML-in-Markdown.
    gr.Markdown(
        """
<h1 style='text-align: center; color: #4CAF50;'>PharmaExpert Pro</h1>
<p style='text-align: center; font-size: 18px; color: #333;'>
Your advanced chatbot for pharmaceutical sciences expertise!
</p>
        """,
        elem_id="header"
    )
    # Chat transcript using OpenAI-style {"role", "content"} message dicts.
    chatbot = gr.Chatbot(type="messages", elem_id="chatbot")
    msg = gr.Textbox(label="Enter your query", placeholder="Type your question here...", lines=2, max_lines=5)
    submit_btn = gr.Button("Submit", elem_id="submit-btn")

    def respond(message, chat_history):
        # Append the user turn, generate the assistant turn, then return
        # ("", history): the empty string clears the textbox and the
        # updated history refreshes the chatbot component.
        chat_history.append({"role": "user", "content": message})
        response = generate_answer(message)
        chat_history.append({"role": "assistant", "content": response})
        return "", chat_history

    # Both pressing Enter in the textbox and clicking the button submit.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    submit_btn.click(respond, [msg, chatbot], [msg, chatbot])

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()