jaiganesan committed on
Commit
e4f9f91
1 Parent(s): 97e8bd1

Delete app.py

Files changed (1)
  1. app.py +0 -391
app.py DELETED
@@ -1,391 +0,0 @@
-import os
-import os.path
-import serpapi
-import requests
-import streamlit as st
-from typing import List
-from docx import Document
-from bs4 import BeautifulSoup
-import huggingface_hub as hfh
-import feedparser
-from urllib.parse import quote
-from llama_index.llms.openai import OpenAI
-from llama_index.core.schema import MetadataMode, NodeWithScore
-from langchain_community.document_loaders import WebBaseLoader
-from llama_index.embeddings.openai import OpenAIEmbedding
-from langchain_community.document_loaders import PyPDFLoader
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.postprocessor.cohere_rerank import CohereRerank
-from llama_index.core.query_engine import RetrieverQueryEngine
-from llama_index.core.query_engine.multistep_query_engine import MultiStepQueryEngine
-from llama_index.core.indices.query.query_transform.base import StepDecomposeQueryTransform
-from llama_index.core.node_parser import SemanticSplitterNodeParser
-from llama_index.core.retrievers import VectorIndexRetriever, KeywordTableSimpleRetriever, BaseRetriever
-from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SimilarityPostprocessor
-from llama_index.core import (VectorStoreIndex, SimpleDirectoryReader, ServiceContext, load_index_from_storage,
-                              StorageContext, Document, Settings, SimpleKeywordTableIndex,
-                              QueryBundle, get_response_synthesizer)
-
-import warnings
-warnings.filterwarnings("ignore")
-def setting_api_key():
-    try:
-        if st.session_state.openai_api_key and st.session_state.serp_api_key:
-            os.environ['OPENAI_API_KEY'] = st.session_state.openai_api_key
-            st.session_state.hf_token = os.getenv("hf_token")
-            hfh.login(token=st.session_state.hf_token)
-            st.session_state.cohere_api_key = os.getenv("cohere_api_key")
-
-        elif not st.session_state.openai_api_key or not st.session_state.serp_api_key:
-            st.warning("Please set the necessary API keys")
-    except Exception as e:
-        st.warning(e)
-
-
-def setup_llm_embed():
-    template = """<|system|>
-    Clearly state "RAG Output" before the response.
-    Check whether the following pieces of context mention any of the keywords given in the question,
-    and answer as fully as you can using only the context you are given.
-    You are a question-answering system for AI, Machine Learning, Deep Learning, Generative AI,
-    Data Science and Data Analytics. If the given context does not relate to the question,
-    you must not answer on your own; say that you don't know the answer.
-    </s>
-    <|user|>
-    Question: {query_str}</s>
-    <|assistant|> """
-
-    llm = OpenAI(model="gpt-3.5-turbo-0125",
-                 temperature=0.1,
-                 model_kwargs={'trust_remote_code': True},
-                 max_tokens=512,
-                 system_prompt=template)
-
-    # embed_model = OpenAIEmbedding(model="text-embedding-3-small")
-    # embed_model = OpenAIEmbedding()
-    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
-    return llm, embed_model
-
-
-def semantic_split(embed_model, documents):
-    sentence_node_parser = SemanticSplitterNodeParser(buffer_size=1, breakpoint_percentile_threshold=90,
-                                                      embed_model=embed_model)
-    nodes = sentence_node_parser.get_nodes_from_documents(documents)
-    return nodes
-
-
-def ctx_vector_func(llm, embed_model, nodes):
-    # Incorporate Embedding Model and LLM - memory
-    ctx_vector = ServiceContext.from_defaults(
-        llm=llm,
-        embed_model=embed_model,
-        node_parser=nodes)
-    return ctx_vector
-
-
-def saving_vectors(vector_index, keyword_index):
-    vector_index.storage_context.persist(persist_dir="vectors/vector_index/")
-    keyword_index.storage_context.persist(persist_dir="vectors/keyword_index/")
-
-
-def create_vector_and_keyword_index(nodes, ctx_vector):
-    vector_index = VectorStoreIndex(nodes, service_context=ctx_vector)
-    keyword_index = SimpleKeywordTableIndex(nodes, service_context=ctx_vector)
-    saving_vectors(vector_index, keyword_index)
-    return vector_index, keyword_index
-
-
-class CustomRetriever(BaseRetriever):
-    def __init__(
-        self,
-        vector_retriever: VectorIndexRetriever,
-        keyword_retriever: KeywordTableSimpleRetriever,
-        mode: str = "AND",
-    ) -> None:
-
-        self._vector_retriever = vector_retriever
-        self._keyword_retriever = keyword_retriever
-        if mode not in ("AND", "OR"):
-            raise ValueError("Invalid mode.")
-        self._mode = mode
-        super().__init__()
-
-    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
-
-        vector_nodes = self._vector_retriever.retrieve(query_bundle)
-        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
-
-        vector_ids = {n.node.node_id for n in vector_nodes}
-        keyword_ids = {n.node.node_id for n in keyword_nodes}
-
-        combined_dict = {n.node.node_id: n for n in vector_nodes}
-        combined_dict.update({n.node.node_id: n for n in keyword_nodes})
-
-        if self._mode == "AND":
-            retrieve_ids = vector_ids.intersection(keyword_ids)
-        else:
-            retrieve_ids = vector_ids.union(keyword_ids)
-
-        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
-        return retrieve_nodes
-
-
-def search_arxiv(query, max_results=8):
-    encoded_query = quote(query)
-    base_url = 'http://export.arxiv.org/api/query?'
-    query_url = f'{base_url}search_query={encoded_query}&start=0&max_results={max_results}'
-    feed = feedparser.parse(query_url)
-    papers = []
-    for entry in feed.entries:
-        paper_info = {
-            'Title': entry.title,
-            'URL': entry.link
-        }
-        papers.append(paper_info)
-    return papers
-
-
-def remove_empty_lines(lines):
-    non_empty_lines = [line for line in lines if line.strip()]
-    return ' '.join(non_empty_lines)
-
-
-def get_article_and_arxiv_content(query):
-    # Article content
-    serpapi_api_key = st.session_state.serp_api_key
-    search_engine = "google"  # bing
-
-    params = {
-        "engine": "google",
-        "gl": "us",
-        "hl": "en",
-        "api_key": serpapi_api_key,
-        "q": query
-    }
-    serpapi_wrapper = serpapi.GoogleSearch(params)
-    search_results = serpapi_wrapper.get_dict()
-    results = []
-    for result_type in ["organic_results", "related_questions"]:
-        if result_type in search_results:
-            for result in search_results[result_type]:
-                if "title" in result and "link" in result:
-                    # Extract title and link
-                    item = {"title": result["title"], "link": result["link"]}
-                    results.append(item)
-    # Store each article link in a list
-    links = [result['link'] for result in results]
-    titles = [result['title'] for result in results]
-
-    contents = []
-    i = 0
-    for link, title in zip(links, titles):
-
-        response = requests.get(link)
-        soup = BeautifulSoup(response.content, "html.parser")
-        content_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
-        document = ""
-        for tag in content_tags:
-            document += tag.text + "\n"
-
-        if not document:
-            loader = WebBaseLoader(link)
-            document_ = loader.load()
-            document = document_[0].page_content
-        i += 1
-        if i == 4:
-            break
-
-        article = remove_empty_lines(document.split('\n'))
-        contents.append(article)
-
-    base_url = "http://export.arxiv.org/api/query"
-    papers_to_download = search_arxiv(query)
-
-    papers_urls = []
-
-    for paper in papers_to_download:
-        page_url = paper['URL']
-        response = requests.get(page_url)
-        soup = BeautifulSoup(response.content, "html.parser")
-        download_link = soup.find("a", class_="abs-button download-pdf")
-
-        if download_link:
-
-            pdf_url = download_link['href']
-            if not pdf_url.startswith("http"):
-                pdf_url = "https://arxiv.org" + pdf_url
-            papers_urls.append(pdf_url)
-
-    paper_content = []
-    for url_ in papers_urls[:2]:
-        loader = PyPDFLoader(url_)
-        pages = loader.load_and_split()
-        paper_text = ''
-        for page in pages:
-            page_text = remove_empty_lines(page.page_content.split('\n'))
-            paper_text += page_text
-
-        if paper_text:
-            paper_content.append(paper_text)
-
-    return contents + paper_content
-
-
-def file_nodes_vector():
-    with st.spinner("Generating Vector Index..."):
-        PERSIST_DIR_vector = "vectors/vector_index"
-        PERSIST_DIR_keyword = "vectors/keyword_index"
-
-        if not os.path.exists(PERSIST_DIR_vector):
-            os.makedirs(PERSIST_DIR_vector)
-
-        if not os.path.exists(PERSIST_DIR_keyword):
-            os.makedirs(PERSIST_DIR_keyword)
-
-        try:
-            storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR_vector)
-            vector_index = load_index_from_storage(storage_context)
-            storage_context_ = StorageContext.from_defaults(persist_dir=PERSIST_DIR_keyword)
-            keyword_index = load_index_from_storage(storage_context_)
-
-        except FileNotFoundError:
-            documents = SimpleDirectoryReader(input_dir="sample_pdfs/").load_data()
-
-            # LLM and Embedding Model Setup
-
-            llm, embed_model = setup_llm_embed()
-            Settings.llm = llm
-            Settings.embed_model = embed_model
-
-            # Splitting Nodes
-            nodes = semantic_split(embed_model, documents)
-            ctx_vector = ctx_vector_func(llm, embed_model, nodes)
-
-            # Creating Vector index and Keyword Index
-
-            vector_index, keyword_index = create_vector_and_keyword_index(nodes, ctx_vector)
-    return vector_index, keyword_index
-
-
-def response_generation(query, cohere_api_key, vector_index, keyword_index):
-    cohere_rerank = CohereRerank(api_key=cohere_api_key, top_n=4)
-    postprocessor = SimilarityPostprocessor(similarity_cutoff=0.85)  # default 0.80
-
-    sentence_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=8)
-    keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index, similarity_top_k=8)
-    custom_retriever = CustomRetriever(sentence_retriever, keyword_retriever)
-
-    response_synthesizer = get_response_synthesizer()
-    query_engine = RetrieverQueryEngine(retriever=custom_retriever, response_synthesizer=response_synthesizer,
-                                        node_postprocessors=[
-                                            MetadataReplacementPostProcessor(target_metadata_key="window"),
-                                            cohere_rerank, postprocessor])
-
-    # step_decompose_transform = StepDecomposeQueryTransform(llm, verbose=False)
-    # query_engine = MultiStepQueryEngine(query_engine=query_engine, query_transform=step_decompose_transform)
-
-    response = query_engine.query(query)
-    return response
-
-
-def stream_output(response):
-    st.write("""<h1 style="font-size: 20px;">Output From RAG </h1>""", unsafe_allow_html=True)
-    for char in response:
-        st.text(char)
-
-
-def func_add_new_article_content(content_):
-    documents = [Document(text=t) for t in content_]
-    # LLM and Embedding Model Setup
-    llm, embed_model = setup_llm_embed()
-    Settings.llm = llm
-    Settings.embed_model = embed_model
-
-    # Splitting Nodes
-    new_nodes = semantic_split(embed_model, documents)
-    ctx_vector = ctx_vector_func(llm, embed_model, new_nodes)  # documents - nodes
-    new_vector_index, new_keyword_index = create_vector_and_keyword_index(new_nodes, ctx_vector)  # documents - nodes
-    return new_vector_index, new_keyword_index, new_nodes
-
-
-def updating_vector(new_nodes, vector_index, keyword_index):
-    vector_index.insert_nodes(new_nodes)
-    keyword_index.insert_nodes(new_nodes)
-    saving_vectors(vector_index, keyword_index)
-
-
-def main():
-    st.write("""<h1 style="font-size: 30px;">GenAI Question-Answer System Utilizing Advanced Retrieval-Augmented
-    Generation (RAG)</h1>""", unsafe_allow_html=True)
-
-    st.markdown("""This application relies on a paid (closed-source) model and framework to ensure high accuracy and
-    minimize hallucination. Before running the application, two API keys need to be configured. Learn more about
-    these keys and how to generate them below.""")
-
-    st.write("""<h1 style="font-size: 15px;">Enter your OpenAI API key </h1>""", unsafe_allow_html=True)
-    openai_api_key = st.text_input(placeholder="OpenAI api key ", label=" ", type="password")
-
-    st.write("""<h1 style="font-size: 15px;">Enter your SERP API key </h1>""", unsafe_allow_html=True)
-    serp_api_key = st.text_input(placeholder="Serp api key ", label=" ", type="password")
-
-    set_keys_button = st.button("Set Keys ", type="primary")
-    key_flag = False
-
-    try:
-        if set_keys_button:
-            if openai_api_key and serp_api_key:
-                st.session_state.openai_api_key = openai_api_key
-                st.session_state.serp_api_key = serp_api_key
-                setting_api_key()
-                st.success("Successful 👍")
-                key_flag = True
-            else:
-                st.warning("Please set the necessary API keys")
-    except Exception as e:
-        st.warning(e)
-
-    st.write("""<h1 style="font-size: 15px;">Enter your Question </h1>""", unsafe_allow_html=True)
-    query = st.text_input(placeholder="Query ", label=" ", max_chars=192)
-
-    generate_response_button = st.button("Generate response ", type="primary")
-
-    if generate_response_button and key_flag and str(query):
-        vector_index, keyword_index = file_nodes_vector()
-        response = response_generation(query, st.session_state.cohere_api_key, vector_index, keyword_index)
-        if response in ["Empty Response", "RAG Output"] or not response:
-            with st.spinner("Getting information from articles, this may take some time."):
-                content_ = get_article_and_arxiv_content(query)
-                new_vector_index, new_keyword_index, new_nodes = func_add_new_article_content(content_)
-                response = response_generation(query, st.session_state.cohere_api_key, new_vector_index, new_keyword_index)
-            stream_output(response)
-
-            col1, col2 = st.columns([1, 10])
-            thumbs_up_button = col1.button("👍")
-            thumbs_down_button = col2.button("👎")
-            if thumbs_up_button:
-                st.write("Thank you for your positive feedback!")
-                updating_vector(new_nodes, vector_index, keyword_index)
-            if thumbs_down_button:
-                st.write("We're sorry, we will improve it.")
-
-        elif response:
-            stream_output(response)
-            col1, col2 = st.columns([1, 10])
-            if col1.button("👍"):
-                st.write("Thank you for your positive feedback!")
-            if col2.button("👎"):
-                st.write("We're sorry, we will improve it.")
-
-    elif generate_response_button and not str(query) and not key_flag:
-        st.warning("Please set the necessary API keys and enter the query")
-
-    elif generate_response_button and str(query) and not key_flag:
-        st.warning("Please set the necessary API keys")
-
-    elif generate_response_button and key_flag and not str(query):
-        st.warning("Please enter the query!")
-
-
-if __name__ == "__main__":
-    main()
-