Spaces:

valeriylo
/

sample_rag

Running

App Files Files Community

sample_rag / app.py

valeriylo

Update app.py

37717fd verified 8 months ago

raw

history blame

5.28 kB

	import os
	import streamlit as st
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.chat_models import ChatOpenAI
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from langchain.chat_models.gigachat import GigaChat
	from htmlTemplates import css, bot_template, user_template
	from langchain.llms import HuggingFaceHub, LlamaCpp
	from huggingface_hub import snapshot_download, hf_hub_download


	# from prompts import CONDENSE_QUESTION_PROMPT

	# Hugging Face Hub repository id; only referenced by the commented-out
	# snapshot_download(...) call that follows these constants.
	repo_name = "IlyaGusev/saiga_mistral_7b_gguf"
	# File name inside that repository (presumably a quantized GGUF model --
	# TODO confirm). The main script passes it to get_conversation_chain().
	model_name = "model-q4_K.gguf"

	#snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)


	def get_pdf_text(pdf_docs):
	    """Concatenate the extracted text of every page of every given PDF.

	    Args:
	        pdf_docs: Iterable of PDF sources, each handed to ``PdfReader``
	            (the main script passes the files from ``st.file_uploader``).
	            ``None`` is treated like an empty iterable.

	    Returns:
	        A single string with all page texts joined in document/page
	        order, or ``""`` when there is nothing to read.
	    """
	    page_texts = []
	    for pdf in pdf_docs or []:
	        for page in PdfReader(pdf).pages:
	            # A page without extractable text may yield a falsy value;
	            # normalising it to "" keeps the join from failing on None.
	            page_texts.append(page.extract_text() or "")
	    # One join instead of repeated "+=" on an ever-growing string.
	    return "".join(page_texts)


	def get_text_chunks(text):
	    """Split ``text`` into overlapping chunks with ``CharacterTextSplitter``.

	    The splitter breaks on newlines, targets chunks of 1000 units with a
	    200-unit overlap, and measures length with the built-in ``len``.

	    Args:
	        text: The raw text to split.

	    Returns:
	        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
	    """
	    splitter_options = {
	        "separator": "\n",
	        "chunk_size": 1000,
	        "chunk_overlap": 200,
	        "length_function": len,
	    }
	    return CharacterTextSplitter(**splitter_options).split_text(text)


	def get_vectorstore(text_chunks):
	    """Embed the text chunks and index them in a FAISS vector store.

	    Args:
	        text_chunks: The texts to embed and index.

	    Returns:
	        The store built by ``FAISS.from_texts``.
	    """
	    # Embedding back-ends previously tried here and left as alternatives:
	    #   OpenAIEmbeddings()
	    #   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
	    #   HuggingFaceEmbeddings(
	    #       model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
	    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
	    return FAISS.from_texts(texts=text_chunks, embedding=embedder)


	def get_conversation_chain(vectorstore, model_name):
	    """Build a conversational retrieval chain on top of ``vectorstore``.

	    The chain uses GigaChat as the LLM (credentials are read from the
	    ``GIGACHAT_CREDENTIALS`` environment variable), a buffer memory that
	    stores the dialogue under ``chat_history``, and returns the retrieved
	    source documents alongside each answer.

	    Args:
	        vectorstore: Store whose ``as_retriever()`` supplies the retriever.
	        model_name: Currently unused; it was the ``model_path`` of a
	            LlamaCpp back-end that is no longer active here.

	    Returns:
	        The chain created by ``ConversationalRetrievalChain.from_llm``.
	    """
	    # Alternative back-ends that were tried before GigaChat:
	    #   LlamaCpp(model_path=model_name, temperature=0.1, top_k=30, top_p=0.9,
	    #            streaming=True, n_ctx=2048, n_parts=1, echo=True)
	    #   ChatOpenAI()

	    # NOTE(review): verify_ssl_certs=False is passed to GigaChat below --
	    # confirm that skipping certificate verification is intended.
	    chat_model = GigaChat(
	        credentials=os.getenv("GIGACHAT_CREDENTIALS"),
	        verify_ssl_certs=False,
	    )

	    # input_key/output_key are given explicitly: the chain is asked for
	    # source documents too, so its output has more than one key.
	    dialogue_memory = ConversationBufferMemory(
	        memory_key='chat_history',
	        input_key='question',
	        output_key='answer',
	        return_messages=True,
	    )

	    return ConversationalRetrievalChain.from_llm(
	        llm=chat_model,
	        retriever=vectorstore.as_retriever(),
	        memory=dialogue_memory,
	        return_source_documents=True,
	    )


	def handle_userinput(user_question):
	    """Send a question to the conversation chain and render the dialogue.

	    Stores the chain's ``chat_history`` and ``source_documents`` in
	    ``st.session_state`` (as ``chat_history`` and ``retrieved_text``),
	    writes the whole chat history to the page, and then writes the
	    documents retrieved for the latest question.

	    If no conversation chain has been created yet (documents were not
	    processed), a warning is shown and nothing else happens.

	    Args:
	        user_question: The question text; ``None`` falls back to a
	            default greeting.
	    """
	    # Stdlib import kept local so this function does not rely on a
	    # file-level import being present.
	    import html

	    if user_question is None:
	        user_question = "привет"

	    conversation = st.session_state.conversation
	    if conversation is None:
	        # The main script initialises the chain to None until the user
	        # clicks "Process"; calling None would raise a TypeError.
	        st.warning("Please upload and process your documents before asking a question.")
	        return

	    response = conversation({'question': user_question})

	    st.session_state.chat_history = response['chat_history']
	    st.session_state.retrieved_text = response['source_documents']

	    # The memory stores each turn as a user message followed by the bot
	    # answer, so even positions are the user's and odd ones the bot's.
	    # Content is escaped because it is injected into raw HTML.
	    for position, message in enumerate(st.session_state.chat_history):
	        template = user_template if position % 2 == 0 else bot_template
	        st.write(
	            template.replace("{{MSG}}", html.escape(message.content)),
	            unsafe_allow_html=True,
	        )

	    # Show the retrieved source passages once, after the dialogue, rather
	    # than pairing them index-by-index with unrelated history messages
	    # (which also cut the rendered history to the number of documents).
	    for document in st.session_state.retrieved_text:
	        st.write(
	            bot_template.replace(
	                "{{MSG}}", html.escape(str(document.page_content))),
	            unsafe_allow_html=True,
	        )



	#for text in enumerate(st.session_state.retrieved_text):
	# st.write(text[1].page_content, '\n')

	#print(response['source_documents'][0])

	# main code
	# Streamlit re-runs this script top to bottom on every interaction, so the
	# page is laid out here and long-lived objects are kept in session_state.
	load_dotenv()

	st.set_page_config(page_title="Chat with multiple PDFs",
	                   page_icon=":books:")
	st.write(css, unsafe_allow_html=True)

	# Create the session slots once so later reads never hit a missing key.
	if "conversation" not in st.session_state:
	    st.session_state.conversation = None
	if "chat_history" not in st.session_state:
	    st.session_state.chat_history = None

	st.header("Chat with multiple PDFs :books:")
	user_question = st.text_input("Ask a question about your documents: ")

	if user_question:
	    handle_userinput(user_question)

	with st.sidebar:
	    st.subheader("Your documents")
	    pdf_docs = st.file_uploader(
	        "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
	    if st.button("Process"):
	        if not pdf_docs:
	            # Nothing uploaded: there is no text to index, so do not
	            # start the pipeline at all.
	            st.warning("Please upload at least one PDF before processing.")
	        else:
	            with st.spinner("Processing"):
	                # get pdf text
	                raw_text = get_pdf_text(pdf_docs)

	                # get the text chunks
	                text_chunks = get_text_chunks(raw_text)

	                if not text_chunks:
	                    # E.g. scanned PDFs without a text layer: an empty
	                    # chunk list cannot be turned into a vector index.
	                    st.warning("No text could be extracted from the uploaded PDFs.")
	                else:
	                    # create vector store
	                    vectorstore = get_vectorstore(text_chunks)

	                    # create conversation chain
	                    st.session_state.conversation = get_conversation_chain(vectorstore, model_name)