PabloVD committed on
Commit 46e28ab
1 Parent(s): 4e65999

Split app.py into two scripts for better structure.

Files changed (2)
  1. app.py +6 -56
  2. rag.py +54 -0
app.py CHANGED
@@ -1,61 +1,11 @@
-# AI assistant with a RAG system to query information from the CAMELS cosmological simulations using Langchain
+# AI assistant with a RAG system to query information from the CAMELS cosmological simulations using Langchain and deployed with Gradio
 # Author: Pablo Villanueva Domingo
 
-import gradio as gr
-from langchain import hub
-from langchain_chroma import Chroma
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnablePassthrough
+from rag import RAG, load_docs
 from langchain_community.embeddings import HuggingFaceInstructEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_mistralai import ChatMistralAI
-from langchain_community.document_loaders import WebBaseLoader
 from langchain_core.rate_limiters import InMemoryRateLimiter
-
-# Load documentation from urls
-def get_docs():
-
-    # Get urls
-    urlsfile = open("urls.txt")
-    urls = urlsfile.readlines()
-    urls = [url.replace("\n","") for url in urls]
-    urlsfile.close()
-
-    # Load, chunk and index the contents of the blog.
-    loader = WebBaseLoader(urls)
-    docs = loader.load()
-
-    return docs
-
-# Join content pages for processing
-def format_docs(docs):
-    return "\n\n".join(doc.page_content for doc in docs)
-
-# Create a RAG chain
-def RAG(llm, docs, embeddings):
-
-    # Split text
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    splits = text_splitter.split_documents(docs)
-
-    # Create vector store
-    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
-
-    # Retrieve and generate using the relevant snippets of the documents
-    retriever = vectorstore.as_retriever()
-
-    # Prompt basis example for RAG systems
-    prompt = hub.pull("rlm/rag-prompt")
-
-    # Create the chain
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
-
-    return rag_chain
+import gradio as gr
 
 # Define a limiter to avoid rate limit issues with MistralAI
 rate_limiter = InMemoryRateLimiter(
@@ -64,8 +14,8 @@ rate_limiter = InMemoryRateLimiter(
     max_bucket_size=10, # Controls the maximum burst size.
 )
 
-# Get docs
-docs = get_docs()
+# Load the documentation
+docs = load_docs()
 print("Pages loaded:",len(docs))
 
 # LLM model
@@ -117,7 +67,7 @@ if __name__=="__main__":
         examples=example_questions,
         theme=gr.themes.Soft(),
         description=description,
-        cache_examples=False,
+        #cache_examples=False,
         chatbot=chatbot)
 
     demo.launch()
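
The diff elides the middle of app.py (the LLM, embeddings, and chat-callback setup between the hunks), so the following is a minimal sketch of how the refactored pieces fit together after the split. The model name, the requests_per_second value, and the answer() callback are illustrative assumptions, not part of the commit; only the imports, the max_bucket_size, the load_docs() call, and the ChatInterface kwargs are visible above.

# Sketch only: wiring the slimmed-down app.py to the new rag.py module
from rag import RAG, load_docs
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_mistralai import ChatMistralAI
from langchain_core.rate_limiters import InMemoryRateLimiter
import gradio as gr

# Throttle MistralAI calls; requests_per_second is assumed, max_bucket_size comes from the diff
rate_limiter = InMemoryRateLimiter(requests_per_second=0.1, max_bucket_size=10)

# Load the documentation pages listed in urls.txt
docs = load_docs()

llm = ChatMistralAI(model="open-mistral-7b", rate_limiter=rate_limiter)  # model name assumed
embeddings = HuggingFaceInstructEmbeddings()  # defaults to hkunlp/instructor-large
rag_chain = RAG(llm, docs, embeddings)

# Hypothetical chat callback: answer each user question with the RAG chain
def answer(message, history):
    return rag_chain.invoke(message)

if __name__=="__main__":
    demo = gr.ChatInterface(answer, theme=gr.themes.Soft())
    demo.launch()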
rag.py ADDED
@@ -0,0 +1,54 @@
+# Utilities to build a RAG system to query information from the CAMELS cosmological simulations using Langchain
+# Author: Pablo Villanueva Domingo
+
+from langchain import hub
+from langchain_chroma import Chroma
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import WebBaseLoader
+
+# Load documentation from urls
+def load_docs():
+
+    # Get urls
+    urlsfile = open("urls.txt")
+    urls = urlsfile.readlines()
+    urls = [url.replace("\n","") for url in urls]
+    urlsfile.close()
+
+    # Load, chunk and index the contents of the blog.
+    loader = WebBaseLoader(urls)
+    docs = loader.load()
+
+    return docs
+
+# Join content pages for processing
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+
+# Create a RAG chain
+def RAG(llm, docs, embeddings):
+
+    # Split text
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    splits = text_splitter.split_documents(docs)
+
+    # Create vector store
+    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
+
+    # Retrieve and generate using the relevant snippets of the documents
+    retriever = vectorstore.as_retriever()
+
+    # Prompt basis example for RAG systems
+    prompt = hub.pull("rlm/rag-prompt")
+
+    # Create the chain
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    return rag_chain
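
Since rag.py no longer depends on Gradio, the chain it builds can be exercised on its own. Below is a minimal usage sketch, assuming a Mistral chat model and the default instructor embeddings; both choices are assumptions for illustration, not part of the commit.

# Hypothetical standalone use of the new rag.py module
from rag import RAG, load_docs
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_mistralai import ChatMistralAI

docs = load_docs()  # reads urls.txt and fetches each page with WebBaseLoader
llm = ChatMistralAI(model="open-mistral-7b")  # assumed model; any LangChain chat model works
embeddings = HuggingFaceInstructEmbeddings()  # defaults to hkunlp/instructor-large
chain = RAG(llm, docs, embeddings)

# The chain ends in StrOutputParser, so invoke() returns a plain string
print(chain.invoke("What are the CAMELS simulations?"))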