Spaces:
Running
Running
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.chains import RetrievalQAWithSourcesChain | |
from langchain.memory import ConversationBufferWindowMemory | |
from langchain.chains import ConversationalRetrievalChain | |
from langchain.chat_models import ChatOpenAI | |
from langchain.prompts.chat import ( | |
ChatPromptTemplate, | |
SystemMessagePromptTemplate, | |
HumanMessagePromptTemplate, | |
) | |
from langchain.document_loaders import PyPDFLoader | |
import os | |
import chainlit as cl | |
from langchain.prompts import PromptTemplate | |
# Splitter configured for 1000-character chunks that overlap by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# System instructions for the model; the template exposes a single
# {summaries} placeholder and asks for a trailing "SOURCES" section.
system_template = """Use the following pieces of context to answer the users question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
Example of your response should be:
```
The answer is foo
SOURCES: xyz
```
Begin!
----------------
{summaries}"""

# Chat prompt: the system instructions first, then the user's {question}.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)

# Keyword arguments bundle carrying the prompt.
chain_type_kwargs = dict(prompt=prompt)
async def start():
    """Create the "ChatPDF" avatar element and send it.

    NOTE(review): no Chainlit hook decorator is visible on this function in
    this view of the file; confirm how it is registered.
    """
    avatar = cl.Avatar(
        name="ChatPDF",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
        # Alternative kept from the original: path=r'assets/ChatPDFAvatar.jpg'
    )
    await avatar.send()
def _label_page_splits(documents):
    """Tag every chunk with a "Page-<page>.<n>" label in its metadata.

    Each document must carry ``metadata['page']`` (presumably zero-based,
    hence the ``+ 1`` for display). Chunks sharing a page are numbered in the
    order they appear, e.g. ``Page-3.1``, ``Page-3.2``. The label is written
    to ``metadata['page_split_info']``; the documents are modified in place.
    """
    chunks_seen_per_page = {}
    for document in documents:
        page_number = document.metadata['page']
        chunks_seen_per_page[page_number] = chunks_seen_per_page.get(page_number, 0) + 1
        document.metadata['page_split_info'] = (
            f"Page-{page_number + 1}.{chunks_seen_per_page[page_number]}"
        )


async def init():
    """Ask the user for a PDF, index it, and build the question-answering chain.

    Steps: prompt for a PDF upload, split it into page chunks, label each
    chunk, embed the chunks into a Chroma vector store, and wire a
    ConversationalRetrievalChain (with a 5-turn window memory) on top of it.
    The labelled chunks are also stored in the user session under "texts".

    Returns:
        The ConversationalRetrievalChain built over the uploaded document.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    import uuid

    welcome = (
        "Hey, Welcome to ChatPDF!\n\n"
        "ChatPDF is a smart, user-friendly tool that integrates state-of-the-art AI models with text extraction and embedding capabilities to create a unique, conversational interaction with your PDF documents.\n\n"
        "Simply upload your PDF, ask your questions, and ChatPDF will deliver the most relevant answers directly from your document.\n\n"
        "Please upload a PDF file to begin!"
    )

    # Keep asking until the prompt actually yields a file.
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content=welcome, accept=["application/pdf"]
        ).send()
    uploaded = files[0]

    msg = cl.Message(content=f'''Processing "{uploaded.name}"...''')
    await msg.send()

    # Write the upload to a throwaway directory under a fixed name: the
    # user-supplied file name is never used as a path (no traversal, no
    # clashes between sessions) and the copy is removed once parsed.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "upload.pdf")
        with open(pdf_path, "wb") as f:
            f.write(uploaded.content)
        pages = PyPDFLoader(pdf_path).load_and_split()

    # Keep the user-visible file name as the chunk source rather than the
    # temporary path the loader was given.
    for document in pages:
        document.metadata['source'] = uploaded.name

    _label_page_splits(pages)

    # Create a Chroma vector store. A unique collection name isolates this
    # upload's chunks in case the underlying Chroma client is shared between
    # sessions in the same process.
    embeddings = OpenAIEmbeddings()
    docsearch = await cl.make_async(Chroma.from_documents)(
        pages,
        embeddings,
        collection_name=f"chatpdf-{uuid.uuid4().hex}",
    )

    # Conversation memory limited to the last 5 exchanges; 'answer' is named
    # explicitly because the chain also returns source documents.
    memory = ConversationBufferWindowMemory(
        k=5,
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )

    # Create a chain that retrieves the 5 closest chunks from the vector store.
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k", streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(search_kwargs={'k': 5}),
        memory=memory,
        return_source_documents=True,
    )

    # Save the labelled chunks in the user session.
    cl.user_session.set("texts", pages)

    # Let the user know that the system is ready.
    await msg.update(content=f''' "{uploaded.name}" processed. You can now ask questions!''')
    return chain
async def process_response(res):
    """Send the chain's answer together with the passages it was drawn from.

    Args:
        res: Mapping returned by the chain. ``res["answer"]`` is the reply
            text; ``res["source_documents"]``, when present, is the list of
            retrieved chunks, each carrying ``metadata['page_split_info']``.

    The answer is suffixed with a "Sources: ..." line listing the chunk
    labels (or "No sources found"), and each chunk is attached to the message
    as a text element named after its label.
    """
    answer = res["answer"]
    # A missing or empty source list is treated as "no sources" instead of
    # raising.
    source_documents = res.get("source_documents") or []

    # Compute the labels once and reuse them for both the elements and the
    # "Sources" line.
    labels = [doc.metadata['page_split_info'] for doc in source_documents]
    source_elements = [
        cl.Text(content=doc.page_content, name=label)
        for doc, label in zip(source_documents, labels)
    ]

    if labels:
        answer += f"\n\nSources: {', '.join(labels)}"
    else:
        answer += "\n\nNo sources found"

    await cl.Message(content=answer, elements=source_elements).send()