jet-taekyo committed
Commit 86aaf41
1 Parent(s): 45fec1b

modify text-splitting process
app.py CHANGED
@@ -6,6 +6,8 @@ from langchain_core.vectorstores import VectorStoreRetriever
 from langchain_openai import ChatOpenAI
 from chainlit.types import AskFileResponse
 from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.runnables import Runnable
+from langchain_core.documents import Document
 
 # Libraries to be used
 from langchain_community.document_loaders.text import TextLoader
@@ -15,7 +17,8 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_wrappers.langchain_chat_models import MyChatOpenAI
 from langchain_wrappers.langchain_embedding_models import MyOpenAIEmbeddings
 from langchain_qdrant import QdrantVectorStore
-from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+from langchain_core.runnables import RunnablePassthrough, RunnableParallel, Runnable
+from rag_prompts import system_msg, user_msg
 import chainlit as cl
 from dotenv import load_dotenv
 
@@ -24,26 +27,12 @@ from langchain.globals import set_llm_cache, get_llm_cache
 from langchain_community.cache import InMemoryCache
 set_llm_cache(InMemoryCache())
 
-system_template = """\
-Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer.\
-
-Context:
-{context}
-"""
-human_template = """\
-Question:
-{question}
-"""
-system_msg = ('system', system_template)
-user_msg = ('human', human_template)
-
-text_splitter = RecursiveCharacterTextSplitter()
-
 
+# Load the environment variables
 load_dotenv()
 
-### RAG chain
-def Get_RAG_pipeline(retriever: VectorStoreRetriever, llm: ChatOpenAI):
+# RAG chain
+def Get_RAG_pipeline(retriever: VectorStoreRetriever, llm: ChatOpenAI) -> Runnable:
 
     retriever = retriever.with_config({'run_name': 'RAG: Retriever'})
     prompt = ChatPromptTemplate([system_msg, user_msg]).with_config({'run_name': 'RAG Step2: Prompt (Augmented)'})
@@ -66,25 +55,41 @@ def Get_RAG_pipeline(retriever: VectorStoreRetriever, llm: ChatOpenAI):
 
     return RAG_chain
 
-
-def process_text_file(file: AskFileResponse):
+# Split documents
+def process_text_file(file: AskFileResponse) -> List[Document]:
     import tempfile
 
-    if file.name.endswith('.pdf'):
-        print("PDF file detected")
-        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".pdf") as temp_file:
-            temp_file_path = temp_file.name
-            with open(temp_file_path, "wb") as f:
-                f.write(file.content)
-            document_loader = PyPDFLoader(temp_file_path)
-    elif file.name.endswith('.txt'):
-        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
-            temp_file_path = temp_file.name
-            with open(temp_file_path, "wb") as f:
-                f.write(file.content)
-            document_loader = TextLoader(temp_file_path, autodetect_encoding=True)
+    if file.name.endswith('.txt'):
+        suffix = '.txt'
+        base_loader = TextLoader
+    elif file.name.endswith('.pdf'):
+        suffix = '.pdf'
+        base_loader = PyPDFLoader
+
+    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=suffix) as temp_file:
+        temp_file_path = temp_file.name
+        with open(temp_file_path, 'wb') as f:
+            f.write(file.content)
+        document_loader = base_loader(temp_file_path)
+
+
+
+
+    # if file.name.endswith('.pdf'):
+    #     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".pdf") as temp_file:
+    #         temp_file_path = temp_file.name
+    #         with open(temp_file_path, "wb") as f:
+    #             f.write(file.content)
+    #         document_loader = PyPDFLoader(temp_file_path)
+    # elif file.name.endswith('.txt'):
+    #     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
+    #         temp_file_path = temp_file.name
+    #         with open(temp_file_path, "wb") as f:
+    #             f.write(file.content)
+    #         document_loader = TextLoader(temp_file_path, autodetect_encoding=True)
 
     documents = document_loader.load()
+    text_splitter = RecursiveCharacterTextSplitter()
     splitted_documents = [x.page_content for x in text_splitter.transform_documents(documents)]
 
     return splitted_documents
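
For reference, a minimal sketch of the reworked loading-and-splitting flow as a standalone function. Assumptions not in the commit: the `from typing import List` import that the new `List[Document]` annotation needs, an explicit `else` branch (the committed code leaves `suffix` and `base_loader` unbound for other extensions), canonical `langchain_community` / `langchain_text_splitters` import paths, and illustrative `chunk_size`/`chunk_overlap` values where the commit keeps the splitter's defaults.

# Sketch only -- not the committed code. Import paths and chunk sizes are assumptions.
import tempfile
from typing import List

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def split_uploaded_file(name: str, content: bytes) -> List[str]:
    # Pick a loader class from the file extension, as the commit now does.
    if name.endswith('.txt'):
        suffix, base_loader = '.txt', TextLoader
    elif name.endswith('.pdf'):
        suffix, base_loader = '.pdf', PyPDFLoader
    else:
        # Added here; the committed code has no fallback branch.
        raise ValueError(f'Unsupported file type: {name}')

    # Write the upload to disk so a path-based loader can read it back.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(content)
        temp_file_path = temp_file.name

    documents = base_loader(temp_file_path).load()
    # The splitter is now built per call; chunk parameters here are illustrative.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return [doc.page_content for doc in text_splitter.transform_documents(documents)]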
langchain_wrappers/langchain_chat_models.py CHANGED
@@ -4,10 +4,10 @@ from typing import Optional
 from langchain_openai import ChatOpenAI
 
 
-# import inspect
-# load_dotenv(os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) , '.env'))
+import inspect
+load_dotenv(os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) , '.env'))
 
-load_dotenv()
+# load_dotenv()
 class MyChatOpenAI:
     @classmethod
     def from_model(
langchain_wrappers/langchain_embedding_models.py CHANGED
@@ -3,10 +3,10 @@ from dotenv import load_dotenv
 from typing import Optional
 from langchain_openai.embeddings import OpenAIEmbeddings
 
-# import inspect
-# load_dotenv(os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) , '.env'))
+import inspect
+load_dotenv(os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) , '.env'))
 
-load_dotenv()
+# load_dotenv()
 class MyOpenAIEmbeddings:
     @classmethod
     def from_model(
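
Both wrapper modules trade the cwd-dependent `load_dotenv()` for resolving `.env` next to the module file itself. A minimal sketch of the same idea using `pathlib` instead of `inspect` (a hypothetical simplification, not what the commit ships):

# Sketch: module-relative .env loading without inspect. Hypothetical alternative.
from pathlib import Path

from dotenv import load_dotenv

# __file__ identifies this module, so the .env beside it is found no matter
# which working directory the app is launched from.
load_dotenv(Path(__file__).resolve().parent / '.env')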
rag_prompts.py ADDED
@@ -0,0 +1,14 @@
+system_template = """\
+Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer.\
+
+Context:
+{context}
+"""
+system_msg = ('system', system_template)
+
+
+human_template = """\
+Question:
+{question}
+"""
+user_msg = ('human', human_template)
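
The exported `(role, template)` tuples are what app.py now feeds to `ChatPromptTemplate`. A short usage sketch; the context and question strings are illustrative placeholders:

# Sketch of how the exported prompt tuples are consumed; values are placeholders.
from langchain_core.prompts import ChatPromptTemplate

from rag_prompts import system_msg, user_msg

prompt = ChatPromptTemplate([system_msg, user_msg])
messages = prompt.format_messages(
    context='Paris is the capital of France.',
    question='What is the capital of France?',
)
# messages is [SystemMessage(...), HumanMessage(...)], ready for a chat model.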