# Ask-Your-PDFs / llm_chain.py
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from app_style import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
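
# Load OPENAI_API_KEY / HUGGINGFACEHUB_API_TOKEN from a local .env file, if one
# exists (assumed to be how the app supplies its API credentials; otherwise a no-op).
load_dotenv()
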
class RAG_PDF:
    '''
    Class implementing a RAG pipeline for answering questions from PDFs.
    '''
    def __init__(self, pdf_docs, model="open-source"):
        '''
        Store the uploaded PDFs and select the embedding/LLM backend.
        '''
        self.pdf_docs = pdf_docs
        if model == "open-source":
            # Open-source model to generate embeddings for the text
            self.embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
            # Open-source model to generate the response (currently FLAN-T5-XXL)
            self.llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
        elif model == "openai":
            # OpenAI model to generate embeddings for the text
            self.embeddings = OpenAIEmbeddings()
            # OpenAI chat model to generate the response
            self.llm = ChatOpenAI()
        else:
            raise ValueError(f"Unknown model option: {model!r}. Use 'open-source' or 'openai'.")
    def pdf_extract_text(self):
        '''
        Extract the raw text from all uploaded PDFs.
        '''
        text = ""
        for pdf in self.pdf_docs:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only pages
                text += page.extract_text() or ""
        return text
    def pdf_chunkize(self, text):
        '''
        Split the extracted text into smaller, overlapping chunks.
        '''
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,  # overlap keeps neighbouring chunks context-aware
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        return chunks
    def pdf_vectorstore(self, text_chunks):
        '''
        Create a FAISS vector store from the text chunks.
        '''
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=self.embeddings)
        return vectorstore
    def pdf_conversation_chain(self, vectorstore):
        '''
        Create a conversational retrieval chain backed by chat memory.
        '''
        memory = ConversationBufferMemory(
            memory_key='chat_history', return_messages=True)
        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )
        return conversation_chain
    def activate_RAG_pipeline(self):
        '''
        Run the full pipeline: extract text, chunk it, embed it, and return a conversation chain.
        '''
        # get the raw PDF text
        raw_text = self.pdf_extract_text()
        # split the text into chunks
        text_chunks = self.pdf_chunkize(raw_text)
        # create the vector store
        vectorstore = self.pdf_vectorstore(text_chunks)
        # create the conversation chain
        conversation_chain = self.pdf_conversation_chain(vectorstore)
        return conversation_chain
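

# A minimal standalone usage sketch (not part of the Streamlit app): the real front
# end is assumed to collect uploaded PDFs via Streamlit and render the conversation
# with the css / bot_template / user_template imported above. "sample.pdf" and the
# question below are placeholder values.
if __name__ == "__main__":
    rag = RAG_PDF(["sample.pdf"], model="open-source")
    conversation = rag.activate_RAG_pipeline()
    # The chain returns a dict containing the answer and the accumulated chat history.
    result = conversation({"question": "What is this document about?"})
    print(result["answer"])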