File size: 4,535 Bytes
ff1f92b
5585965
 
 
 
 
 
900a2bf
b884de1
cf0475c
565faf3
 
 
 
6126668
8c1aead
565faf3
5585965
 
 
 
 
 
 
ff1f92b
f5dd29d
5585965
 
 
 
 
 
ff1f92b
5585965
 
 
 
 
 
ff1f92b
5585965
ff1f92b
5585965
 
 
 
 
 
 
 
 
 
 
 
 
7446d35
 
 
 
a9e1591
cc21256
5585965
 
 
 
 
 
 
 
 
 
ff1f92b
5585965
a9e1591
cc21256
5585965
 
 
 
ff1f92b
6126668
 
 
 
5585965
f1e2b8d
 
87177f6
f1e2b8d
 
 
 
 
71068ec
565faf3
171a569
71068ec
7de699a
c94507e
7de699a
71068ec
565faf3
18e53c5
 
 
 
7de699a
 
18e53c5
 
0fc73f9
7de699a
 
 
 
4eaef39
266e137
b041f09
a0bdf98
ff1f92b
5585965
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import AutoModel

#Retriever erweiterung
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub

###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip install tiktoken
#pip install InstructorEmbedding
###############


# PDF in String umwandeln
def get_pdf_text(folder_path):
    """Concatenate the extracted text of every PDF file in a folder.

    Only regular files directly inside ``folder_path`` whose name ends in
    ``.pdf`` (case-insensitive) are read; sub-directories are not searched.
    Files are visited in sorted name order so that repeated runs over the
    same folder produce the same text, since ``os.listdir`` itself returns
    entries in arbitrary order.

    Args:
        folder_path: Directory that contains the PDF files.

    Returns:
        The text of all pages of all PDFs, joined without any separator.
        An empty string if the folder contains no PDF files.
    """
    page_texts = []
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)

        # Skip anything that is not a regular file with a ".pdf" extension.
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue

        for page in PdfReader(filepath).pages:
            # A page without a text layer may yield nothing; treat that as
            # empty text instead of failing when the pieces are joined.
            page_texts.append(page.extract_text() or "")

    # Join once at the end instead of growing a string page by page.
    return "".join(page_texts)

#Chunks erstellen
def get_text_chunks(text):
    """Split a long text into overlapping chunks.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` produces for ``text``
        with the settings below.
    """
    # Split on line breaks; sizes are measured with len(): a chunk size of
    # 1000 with an overlap of 200 between neighbouring chunks.
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

# nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank
def create_vectorstore_and_store():
    """Build a FAISS vector store from the PDFs in ``./files`` and save it.

    The PDF text is extracted with ``get_pdf_text``, split with
    ``get_text_chunks``, embedded, and the resulting index is written to the
    local ``Store`` directory. The store object is also printed to stdout.

    Returns:
        None.

    Raises:
        ValueError: If no text chunks could be produced from ``./files``;
            an index cannot be built from zero texts.
    """
    folder_path = './files'
    # Directory in which the vector database is saved.
    save_directory = "Store"

    text_chunks = get_text_chunks(get_pdf_text(folder_path))
    if not text_chunks:
        # Fail early with a clear message instead of an opaque error from
        # inside the index construction, and before loading the model.
        raise ValueError(
            f"No text could be extracted from the PDF files in {folder_path!r}; "
            "cannot build the vector store."
        )

    # Same model name as in get_vectorstore, which loads this index again.
    # Alternative model that was tried: "aari1995/German_Semantic_STS_V2".
    embeddings = HuggingFaceInstructEmbeddings(model_name="deutsche-telekom/bert-multi-english-german-squad2")

    # Create the FAISS database and persist it locally.
    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstoreDB.save_local(save_directory)
    print(vectorstoreDB)
    return None
    
########

def get_vectorstore():
    """Load the locally saved FAISS vector store.

    Returns:
        The store that ``FAISS.load_local`` reads from the ``Store``
        directory, using the embedding model named below.
    """
    # Alternative model that was tried: "aari1995/German_Semantic_STS_V2".
    return FAISS.load_local(
        "Store",
        HuggingFaceInstructEmbeddings(
            model_name="deutsche-telekom/bert-multi-english-german-squad2"
        ),
    )

######


#####    
def _build_context(retrieved_docs, max_docs=3):
    """Merge the text of the first ``max_docs`` retrieved documents."""
    # Slicing tolerates result lists shorter than max_docs.
    context = "".join(doc.page_content for doc in retrieved_docs[:max_docs])
    # Flatten line breaks and drop "- " sequences (presumably hyphenation
    # left over from the PDF extraction) so the context reads as plain text.
    return context.replace("\n", " ").replace("- ", "")


def main():
    """Render the Streamlit page: ask a question and answer it from the PDFs.

    Steps, executed only once the user has entered a question:
      1. Retrieve matching passages from the stored vector database.
      2. Show the merged passages that serve as context.
      3. Extract a short answer with an extractive question-answering model.
      4. Ask a text-to-text model to phrase that answer as a sentence.

    Returns:
        None.
    """
    user_question = st.text_area("Stell mir eine Frage: ")
    if not user_question:
        # Nothing to do yet; avoid loading models and querying the store.
        return

    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke(user_question)

    st.text(user_question)
    if not retrieved_docs:
        # Without context the question-answering model cannot run.
        st.warning("Es wurden keine passenden Textausschnitte gefunden.")
        return

    context = _build_context(retrieved_docs)
    st.text("Das ist der Textausschnitt der durch den Retriever herausgesucht wird:")
    st.text(context)

    # Question-answering pipeline for German.
    # NOTE(review): both pipelines are re-created on every Streamlit rerun;
    # consider caching them if the installed Streamlit version supports it.
    qa_pipeline = pipeline("question-answering", model="deutsche-telekom/bert-multi-english-german-squad2", tokenizer="deutsche-telekom/bert-multi-english-german-squad2")

    # Answer the question from the retrieved context.
    answer = qa_pipeline(question=user_question, context=context)

    st.text("Basisantwort:")
    st.text(answer["answer"])

    # Rephrase the extracted answer into a full sentence.
    text2text_generator = pipeline("text2text-generation", model="google/flan-t5-xxl")
    generated = text2text_generator("Formuliere einen neuen Satz. Frage: "+user_question+ " Antwort: " + answer["answer"])
    # The pipeline returns a list of dicts; show the text, not the raw list.
    st.text(generated[0]["generated_text"])

# Entry point: run the page only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()