import os
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from transformers import pipeline
import gradio as gr
HUGGING_FACE_TOKEN = os.environ["HUGGING_FACE_TOKEN"]
# You can choose from the many models available on the Hugging Face Hub (https://huggingface.co/models)
model_name = "llmware/industry-bert-insurance-v0.1"
def remove_special_characters(string):
    return re.sub(r"\n", " ", string)
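# e.g. remove_special_characters("line one\nline two") returns "line one line two"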
def RAG_Langchain(query):
    embeddings = SentenceTransformerEmbeddings(model_name=model_name)
    repo_id = "llmware/bling-sheared-llama-1.3b-0.1"
    loader = DirectoryLoader('data/', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    documents = loader.load()
    # The chunk size is an important parameter for the quality of the retrieved information;
    # there are several methods for choosing its value.
    # The overlap is the number of characters shared between a chunk and the next one.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
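    # Optional sketch (commented out, assuming texts is non-empty): a quick way to sanity-check
    # the chunk_size choice is to look at the distribution of chunk lengths before indexing.
    # lengths = [len(t.page_content) for t in texts]
    # print(f"{len(lengths)} chunks, min={min(lengths)}, max={max(lengths)}, "
    #       f"mean={sum(lengths) / len(lengths):.0f} characters")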
    chunk = texts[0]
    chunk.page_content = remove_special_characters(chunk.page_content)
    # Data preparation
    for chunks in texts:
        chunks.page_content = remove_special_characters(chunks.page_content)
    # Load all the documents into the vector database so they can be used afterwards
    vector_stores = Chroma.from_documents(texts, embeddings, collection_metadata={"hnsw:space": "cosine"}, persist_directory="stores/insurance_cosine")
    # Retrieval
    load_vector_store = Chroma(persist_directory="stores/insurance_cosine", embedding_function=embeddings)
    # For now we take k=1; we will see later how to select the context results
    docs = load_vector_store.similarity_search_with_score(query=query, k=1)
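    # A hedged sketch of an alternative retrieval step (commented out): take more candidates (k=3)
    # and keep only those under a distance threshold. With the cosine space configured above, lower
    # scores mean closer matches; the 0.5 threshold is an illustrative assumption, not a tuned value.
    # candidates = load_vector_store.similarity_search_with_score(query=query, k=3)
    # docs = [(doc, score) for doc, score in candidates if score < 0.5] or candidates[:1]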
    results = {"Score": [], "Content": [], "Metadata": []}
    for i in docs:
        doc, score = i
        # print({"Score": score, "Content": doc.page_content, "Metadata": doc.metadata})
        results['Score'].append(score)
        results['Content'].append(doc.page_content)
        results['Metadata'].append(doc.metadata)
    context = results['Content']
    return results
def generateResponseBasedOnContext(model_name, context_string, query):
    question_answerer = pipeline("question-answering", model=model_name)
    context_prompt = "You are a sports expert. Answer the user's question by using the following context: "
    context = context_prompt + context_string
    print("context : ", context)
    result = question_answerer(question=query, context=context)
    return result['answer']
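# A minimal standalone usage sketch (commented out; the question text and the model choice are
# illustrative assumptions) showing how the two steps above chain together outside of Gradio:
# retrieved = RAG_Langchain("What does the policy cover?")
# answer = generateResponseBasedOnContext("distilbert-base-uncased-distilled-squad",
#                                         str(retrieved['Content']),
#                                         "What does the policy cover?")
# print(answer)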
def gradio_adapted_RAG(model_name, query):
    context = str(RAG_Langchain(query)['Content'])
    generated_answer = generateResponseBasedOnContext(str(model_name),
                                                      context,
                                                      query)
    return generated_answer
dropdown = gr.Dropdown(choices=["distilbert-base-uncased-distilled-squad",
                                "impira/layoutlm-document-qa",
                                "impira/layoutlm-invoices"], label="Choose a model")
iface = gr.Interface(fn=gradio_adapted_RAG, inputs=[dropdown, "text"], outputs="text")
iface.launch()
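# Optional variant (not part of the original setup): Gradio's launch() accepts share=True to create
# a temporary public link if the app needs to be reachable from outside the local machine.
# iface.launch(share=True)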