import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import AutoModel
# Retriever extension
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip install tiktoken
#pip install InstructorEmbedding
###############
# Convert every PDF in a folder into a single string
def get_pdf_text(folder_path):
    text = ""
    # Walk through all files in the given directory
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        # Only process files with a ".pdf" extension
        if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
            pdf_reader = PdfReader(filepath)
            for page in pdf_reader.pages:
                # extract_text() can return None for pages without a text layer
                text += page.extract_text() or ""
                #text += '\n'
    return text
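# Illustrative call (assumes a local "./files" directory containing PDFs,
# as used by create_vectorstore_and_store() below):
#
#   raw_text = get_pdf_text("./files")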
# Create chunks
def get_text_chunks(text):
    # Configure how the text splitter works
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks
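# Illustrative behaviour (not part of the app flow): CharacterTextSplitter only
# cuts at the "\n" separator, so pieces longer than chunk_size stay intact,
# while shorter pieces are merged until chunk_size is reached:
#
#   chunks = get_text_chunks("\n".join(["x" * 900] * 6))
#   print(len(chunks))  # 6 chunks of 900 characters each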
# Only needed once: create the local "Store" directory and persist the vector database
def create_vectorstore_and_store():
    folder_path = './files'
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    embeddings = HuggingFaceInstructEmbeddings(model_name="deutsche-telekom/bert-multi-english-german-squad2")
    #embeddings = HuggingFaceInstructEmbeddings(model_name="aari1995/German_Semantic_STS_V2")
    # Initialise the FAISS DB
    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # Directory in which the vector DB is saved
    save_directory = "Store"
    # Persist the vector DB locally
    vectorstoreDB.save_local(save_directory)
    print(vectorstoreDB)
    return None
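# Minimal sketch (assumption: run once, e.g. from a Python shell, before
# starting the Streamlit app) so the store is only built when it is missing:
#
#   if not os.path.exists("Store"):
#       create_vectorstore_and_store()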
########
def get_vectorstore():
embeddings = HuggingFaceInstructEmbeddings(model_name="deutsche-telekom/bert-multi-english-german-squad2")
#embeddings = HuggingFaceInstructEmbeddings(model_name="aari1995/German_Semantic_STS_V2")
    # Load the local vector database
save_directory = "Store"
vectorstoreDB = FAISS.load_local(save_directory, embeddings)
return vectorstoreDB
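# Note: the retriever in main() uses LangChain's default of the k=4 most
# similar chunks; the count can also be set explicitly (illustrative sketch):
#
#   retriever = get_vectorstore().as_retriever(search_kwargs={"k": 4})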
######
"""
def get_conversation_chain(vectorstore):
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
conversation_chain = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=vectorstore.as_retriever()
)
return conversation_chain
"""
#####
def main():
    #if os.path.exists("./Store"): # only read user input once the vectorstore exists
    user_question = st.text_area("Stell mir eine Frage: ")
    if user_question:
        question = user_question
        st.text(user_question)
        retriever = get_vectorstore().as_retriever()
        retrieved_docs = retriever.invoke(user_question)
        # Concatenate the most similar chunks into a single context string
        context = "".join(doc.page_content for doc in retrieved_docs[:3])
        # str.replace returns a new string, so the result must be reassigned
        context = context.replace("\n", " ")
        st.text("Das ist der Kontext:")
        st.text(context)
        ## IDEA: text generation
        #generator = pipeline('text-generation', model = 'gpt2')
        #answer = generator(context, max_length = 30, num_return_sequences=3)
        #st.text("FORMATIERTE ANTWORT:")
        #st.text_area()
        #st.text(answer)
        #st.text(type(answer))
        # Build the question-answering pipeline for German
        qa_pipeline = pipeline("question-answering", model="deutsche-telekom/bert-multi-english-german-squad2", tokenizer="deutsche-telekom/bert-multi-english-german-squad2")
        # Answer the question
        answer = qa_pipeline(question=question, context=context)
        # Output the answer
        st.text("Basisantwort:")
        st.text(answer["answer"])
        st.text(answer)
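        # The question-answering pipeline also returns a confidence score and
        # character offsets ("score", "start", "end"); a minimal sketch for
        # surfacing the score alongside the answer:
        #
        #   st.text(f"Score: {answer['score']:.3f}")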
######
#newA = get_conversation_chain(get_vectorstore())
#st.text(newA)
"""
generator = pipeline('text-generation', model = 'tiiuae/falcon-40b')
generator(answer, max_length = 30, num_return_sequences=3)
st.text("Generierte Erweiterung:")
st.text(generator)
"""
"""
#IDEE Retriever erweitern
template = Answer the question based only on the following context:
{context}
Question: {question}
prompt = ChatPromptTemplate.from_template(template)
model = AutoModel.from_pretrained("hkunlp/instructor-base")
def format_docs(docs):
return "\n\n".join([d.page_content for d in docs])
chain = (
{"context": retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| model
| StrOutputParser()
)
ausgabetext = chain.invoke(user_question)
st.text(ausgabetext)
"""
if __name__ == '__main__':
main()