import streamlit as st |
from langchain.embeddings import HuggingFaceInstructEmbeddings |
from langchain.vectorstores import FAISS |
from langchain.text_splitter import CharacterTextSplitter |
from langchain.document_loaders import DirectoryLoader, PyPDFLoader |
import os |
from PyPDF2 import PdfReader |
from transformers import pipeline |
from transformers import AutoModel |
from langchain.prompts import ChatPromptTemplate |
from langchain.schema import StrOutputParser |
from langchain.schema.runnable import RunnablePassthrough |
def get_pdf_text(folder_path):
    """Return the concatenated text of every PDF located directly in *folder_path*.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are read;
    sub-directories are not traversed. Files are processed in sorted name order
    so the result is reproducible across runs and platforms.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        The text of all pages of all matching PDFs joined into one string, or
        an empty string when the directory contains no PDF files.

    Raises:
        OSError: If *folder_path* cannot be listed (propagated from
            ``os.listdir``).
    """
    page_texts = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue
        pdf_reader = PdfReader(filepath)
        for page in pdf_reader.pages:
            # Defensive: treat a falsy result (no extractable text on the
            # page) as empty so the concatenation can never fail.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)
def get_text_chunks(text):
    """Split *text* into overlapping chunks using ``CharacterTextSplitter``.

    The splitter is configured to break on newlines, with a chunk size of
    1000 and an overlap of 200, both measured with ``len``.

    Args:
        text: The raw text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for *text*.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def create_vectorstore_and_store(folder_path='./files', save_directory="Store"):
    """Build a FAISS index from the PDFs in *folder_path* and save it locally.

    The PDF text is extracted with ``get_pdf_text``, split with
    ``get_text_chunks``, embedded with the ``hkunlp/instructor-base``
    instructor model and written to *save_directory* via ``save_local``.

    Args:
        folder_path: Directory scanned for PDF files. Defaults to ``./files``.
        save_directory: Directory the index is saved to. Defaults to
            ``Store``, the directory ``get_vectorstore`` loads from.

    Returns:
        None.

    Raises:
        ValueError: If no text chunks could be produced from *folder_path*.
    """
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    # Fail fast with a clear message before loading the embedding model:
    # an index cannot be built from an empty list of texts.
    if not text_chunks:
        raise ValueError(
            f"No text could be extracted from PDF files in {folder_path!r}; "
            "nothing to index."
        )

    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstoreDB.save_local(save_directory)
    print(vectorstoreDB)
    return None
def get_vectorstore():
    """Load the FAISS vector store saved in the ``Store`` directory.

    The store is loaded with a ``hkunlp/instructor-base`` instructor
    embedding object, created fresh on every call.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    # NOTE(review): load_local presumably unpickles part of the saved index,
    # so only point this at a directory you created yourself; confirm
    # against the installed LangChain version.
    return FAISS.load_local(
        "Store",
        HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base"),
    )
def main():
    """Render the question-answering page.

    Flow:
      1. Read the user's question from a text area; do nothing until a
         non-blank question has been entered.
      2. Retrieve the best-matching chunk from the stored FAISS index and
         show it together with the question.
      3. Show three sampled gpt2 continuations of the retrieved context.
      4. Run a retrieval-augmented chain (retriever -> prompt -> gpt2 ->
         string parser) on the question and show its output.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Local import so this function brings its own dependency into scope;
    # it comes from the langchain package the module already depends on.
    from langchain.llms import HuggingFacePipeline

    user_question = st.text_area("Stell mir eine Frage: ")
    # Skip retrieval and generation entirely on the empty initial render.
    if not user_question or not user_question.strip():
        return

    # gpt2's context window is 1024 tokens, so only the best-matching chunk
    # is retrieved and fed into the prompt.
    retriever = get_vectorstore().as_retriever(search_kwargs={"k": 1})
    retrieved_docs = retriever.invoke(user_question)
    if not retrieved_docs:
        st.text("Keine passenden Dokumente gefunden.")
        return

    context = retrieved_docs[0].page_content
    st.text(context)
    st.text(user_question)

    # max_new_tokens bounds only the generated continuation. The previous
    # max_length=30 also counted the prompt tokens, which a ~1000-character
    # context exceeds on its own.
    generator = pipeline('text-generation', model='gpt2', max_new_tokens=64)
    # Several return sequences require sampling, so request it explicitly.
    answer = generator(
        context,
        max_new_tokens=30,
        num_return_sequences=3,
        do_sample=True,
    )
    st.text(answer)

    template = (
        "Answer the question based only on the following context:\n"
        "{context}\n"
        "Question: {question}\n"
    )
    prompt = ChatPromptTemplate.from_template(template)

    # A bare transformers AutoModel of the embedding checkpoint cannot
    # generate text from a prompt; wrap the text-generation pipeline in
    # LangChain's LLM adapter instead.
    llm = HuggingFacePipeline(pipeline=generator)

    def format_docs(docs):
        return "\n\n".join([d.page_content for d in docs])

    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    ausgabetext = chain.invoke(user_question)
    st.text(ausgabetext)
# Call main() only when this module is executed directly, not when imported.
if __name__ == '__main__':
    main()