import os

import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.llms import HuggingFaceHub
from langchain.chains import ConversationalRetrievalChain
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModel
# Retriever extension (used in the parked LCEL idea at the bottom of the file)
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

###########
# pip install faiss-cpu
# pip install langchain
# pip install pypdf
# pip install tiktoken
# pip install InstructorEmbedding
###############

# Convert all PDFs in a folder into a single string
def get_pdf_text(folder_path):
    text = ""
    # Walk through every file in the given directory
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        # Only process files with a ".pdf" extension
        if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
            pdf_reader = PdfReader(filepath)
            for page in pdf_reader.pages:
                text += page.extract_text()
                #text += '\n'
    return text

# Split the raw text into chunks
def get_text_chunks(text):
    # Configure how the text splitter works
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Only needed once: create the local directory "Store" and persist the vector database
def create_vectorstore_and_store():
    folder_path = './files'
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    # Note: HuggingFaceInstructEmbeddings targets INSTRUCTOR-style models; the
    # commented-out German sentence-embedding model may be the better fit here.
    embeddings = HuggingFaceInstructEmbeddings(model_name="deutsche-telekom/bert-multi-english-german-squad2")
    #embeddings = HuggingFaceInstructEmbeddings(model_name="aari1995/German_Semantic_STS_V2")
    # Initialise the FAISS DB
    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # Directory in which the vector DB is persisted
    save_directory = "Store"
    # Save the vector DB locally
    vectorstoreDB.save_local(save_directory)
    print(vectorstoreDB)
    return None

########
# Load the locally persisted vector database
def get_vectorstore():
    embeddings = HuggingFaceInstructEmbeddings(model_name="deutsche-telekom/bert-multi-english-german-squad2")
    #embeddings = HuggingFaceInstructEmbeddings(model_name="aari1995/German_Semantic_STS_V2")
    save_directory = "Store"
    vectorstoreDB = FAISS.load_local(save_directory, embeddings)
    return vectorstoreDB

######
def get_conversation_chain(vectorstore):
    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever()
    )
    return conversation_chain
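# --- Usage sketch (illustrative, not wired into main) ------------------------
# A minimal sketch of how the conversation chain above could be driven with an
# explicit chat history: ConversationalRetrievalChain expects a "question" plus
# a "chat_history" list of (question, answer) tuples. The helper name
# ask_with_history is an assumption, not part of the original app.
def ask_with_history(chain, question, chat_history):
    # The chain condenses question + history into a standalone query,
    # retrieves matching chunks from FAISS and lets the LLM answer it.
    result = chain({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))
    return result["answer"], chat_history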
#####
def main():
    #if os.path.exists("./Store"):  # only read user input once the vector store exists
    user_question = st.text_area("Stell mir eine Frage: ")

    if user_question:
        # Retrieve the most similar chunks only once a question has been entered
        retriever = get_vectorstore().as_retriever()
        retrieved_docs = retriever.invoke(user_question)
        st.text(retrieved_docs[0].page_content)
        context = retrieved_docs[0].page_content
        question = user_question
        st.text(user_question)

        ## IDEA: text generation
        #generator = pipeline('text-generation', model='gpt2')
        #answer = generator(context, max_length=30, num_return_sequences=3)
        #st.text("FORMATIERTE ANTWORT:")
        #st.text(answer)

        # Create the question-answering pipeline for German
        qa_pipeline = pipeline(
            "question-answering",
            model="deutsche-telekom/bert-multi-english-german-squad2",
            tokenizer="deutsche-telekom/bert-multi-english-german-squad2"
        )
        # Answer the question
        answer = qa_pipeline(question=question, context=context)
        # Print the answer
        st.text("Basisantwort:")
        st.text(answer["answer"])
        st.text(answer)

        ######
        newA = get_conversation_chain(get_vectorstore())
        st.text(newA)

        """
        # Parked idea: extend the extractive answer with a text-generation model.
        # Note: qa_pipeline returns a dict, so the answer string must be passed.
        generator = pipeline('text-generation', model='tiiuae/falcon-40b')
        generated = generator(answer["answer"], max_length=30, num_return_sequences=3)
        st.text("Generierte Erweiterung:")
        st.text(generated)
        """

        """
        # IDEA: extend the retriever into a full LCEL chain.
        # Note: a raw transformers AutoModel is not a Runnable and cannot be
        # piped into a chain like this; see the sketch at the bottom of the file.
        template = '''Answer the question based only on the following context:
        {context}

        Question: {question}'''
        prompt = ChatPromptTemplate.from_template(template)
        model = AutoModel.from_pretrained("hkunlp/instructor-base")

        def format_docs(docs):
            return "\n\n".join([d.page_content for d in docs])

        chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | model
            | StrOutputParser()
        )
        ausgabetext = chain.invoke(user_question)
        st.text(ausgabetext)
        """

if __name__ == '__main__':
    main()
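# --- Sketch: prompt-augmented retriever chain ("IDEA: extend the retriever") --
# A minimal runnable version of the idea parked above, assuming a HuggingFaceHub
# LLM stands in for the raw AutoModel (a bare transformers model is not a
# Runnable, so it cannot be piped into an LCEL chain). build_retriever_chain is
# an illustrative name; it is not called from main().
def build_retriever_chain():
    template = """Answer the question based only on the following context:
{context}

Question: {question}"""
    prompt = ChatPromptTemplate.from_template(template)
    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl",
                         model_kwargs={"temperature": 0.5, "max_length": 512})
    retriever = get_vectorstore().as_retriever()

    def format_docs(docs):
        # Join the retrieved chunks into one context block
        return "\n\n".join(d.page_content for d in docs)

    # The retriever fills {context}; the raw question passes through to {question}
    return (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

# Example (assumes the "Store" index exists and a Hugging Face API token is set):
# print(build_retriever_chain().invoke("Stell mir eine Frage"))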