import os
import tempfile

import streamlit as st
from PyPDF2 import PdfFileReader
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import GooglePalmEmbeddings
from langchain.llms import GooglePalm
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Set your Google API Key here
os.environ['GOOGLE_API_KEY'] = 'YOUR_API_KEY'

# Web pages / PDF URLs that are always indexed alongside the uploaded files.
direct_links = [
    "https://zollege.in/exams/comedk-uget",
    "https://zollege.in/exams/comedk-uget/cutoff",
    'https://www.iimrohtak.ac.in/panel/assets/images/prospectus/16881265522765.pdf',
    'https://www.iimrohtak.ac.in/panel/assets/images/lor/16884755042121.pdf',
    'https://www.iimrohtak.ac.in/dpm.php',
    # BUG FIX: a missing comma after the next entry previously made Python
    # concatenate it with the following string into one invalid URL.
    'https://www.iimrohtak.ac.in/dpm-admission.php',
    'https://www.iimrohtak.ac.in/areas-of-specialisation.php',
    'https://www.iimrohtak.ac.in/financial-assistance.php',
    'https://www.iimrohtak.ac.in/panel/assets/images/prospectus/16903487969776.pdf',
    'https://www.iimrohtak.ac.in/faqs-for-dpm.php',
    'https://www.iimrohtak.ac.in/dpm-student.php',
    'https://www.iimrohtak.ac.in/publication.php',
    'https://www.iimrohtak.ac.in/dpm-contact.php',
    'https://www.iimrohtak.ac.in/ipm.php',
]

# Example PDF files (replace with your own file paths)
pdf_files = ["sample.pdf", "sample2.pdf"]


def get_data(direct_links, pdf_files):
    """Return the combined raw text of the given URLs and local PDF files.

    BUG FIX: the original returned ``documents + str`` (list of LangChain
    Documents plus a plain string), which raises ``TypeError`` and, even
    conceptually, fed a non-string into the text splitter downstream.
    The Document page contents are now joined into one string.
    """
    url_docs = UnstructuredURLLoader(urls=direct_links).load()
    url_text = "\n".join(doc.page_content for doc in url_docs)
    return url_text + "\n" + get_pdf_text(pdf_files)


def get_pdf_text(pdf_files):
    """Concatenate the extracted text of every page of every PDF path given.

    NOTE(review): ``PdfFileReader``/``numPages``/``getPage``/``extractText``
    are the deprecated PyPDF2 1.x API; migrate to ``pypdf.PdfReader`` when
    the dependency pin is updated.
    """
    text = ""
    for pdf_file in pdf_files:
        with open(pdf_file, "rb") as file:
            pdf_reader = PdfFileReader(file)
            for page_num in range(pdf_reader.numPages):
                text += pdf_reader.getPage(page_num).extractText()
    return text


def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    chunks = text_splitter.split_text(text)
    return chunks


def get_vector_store(text_chunks):
    """Embed the text chunks with Google PaLM and index them in FAISS."""
    embeddings = GooglePalmEmbeddings()
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    return vector_store


def get_conversational_chain(vector_store):
    """Build a retrieval-augmented chat chain with conversation memory."""
    llm = GooglePalm()
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm, retriever=vector_store.as_retriever(), memory=memory
    )
    return conversation_chain


def user_input(user_question):
    """Send the question to the active chain and render the chat history.

    Alternating history entries are human (even index) and bot (odd index),
    matching ``ConversationBufferMemory``'s message ordering.
    """
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chatHistory = response['chat_history']
    for i, message in enumerate(st.session_state.chatHistory):
        if i % 2 == 0:
            st.write("Human: ", message.content)
        else:
            st.write("Bot: ", message.content)


def main():
    """Streamlit entry point: question box plus sidebar document ingestion."""
    st.set_page_config("Chat with Multiple PDFs")
    st.header("Chat with Multiple PDF 💬")
    user_question = st.text_input("Ask a Question from the PDF Files")

    # Initialise session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chatHistory" not in st.session_state:
        st.session_state.chatHistory = None

    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Settings")
        st.subheader("Upload your Documents")
        pdf_uploads = st.file_uploader(
            "Upload your PDF Files and Click on the Process Button",
            accept_multiple_files=True,
        )
        if st.button("Process"):
            with st.spinner("Processing"):
                # Persist each upload to a temp file so PyPDF2 can open it
                # by path. Writing inside the NamedTemporaryFile context
                # avoids the original's extra re-open of the same path.
                pdf_files = []
                for uploaded_file in pdf_uploads:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                        tmp.write(uploaded_file.read())
                        pdf_files.append(tmp.name)
                combined_text = get_data(direct_links, pdf_files)
                text_chunks = get_text_chunks(combined_text)
                vector_store = get_vector_store(text_chunks)
                st.session_state.conversation = get_conversational_chain(vector_store)
                st.success("Done")


if __name__ == "__main__":
    main()