2 / handler.py

Update handler.py

0c29b88 verified 7 months ago

6.17 kB

	import torch
	import locale
	import os
	from typing import Dict, List, Any
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
	from langchain.llms import HuggingFacePipeline
	from langchain.retrievers.document_compressors import LLMChainExtractor
	from langchain.retrievers import ContextualCompressionRetriever
	from langchain.vectorstores import Chroma
	from langchain import PromptTemplate, LLMChain
	from langchain.chains import RetrievalQA, ConversationalRetrievalChain
	from langchain.prompts import PromptTemplate
	from langchain.prompts.prompt import PromptTemplate
	from langchain.memory import ConversationBufferMemory
	from langchain.embeddings import HuggingFaceBgeEmbeddings
	from langchain.document_loaders import WebBaseLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains.qa_with_sources import load_qa_with_sources_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
	from langchain_core.messages import HumanMessage
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.runnables import RunnableLambda, RunnableBranch, RunnablePassthrough
	from operator import itemgetter
	from langchain.schema import format_document
	from langchain.memory import ConversationBufferMemory
	from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
	from langchain_core.runnables import RunnableParallel
	from typing import Optional
	from langchain.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores.utils import DistanceStrategy

	# Hugging Face model id used both for the chunking tokenizer and for embeddings.
	EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"

	# Separators tried in order by the recursive text splitter, from the coarsest
	# structural boundary down to single characters.
	MARKDOWN_SEPARATORS = [
	    "\n#{1,6} ",  # heading: one to six '#' followed by a space
	    "```\n",  # end of a fenced code block
	    # NOTE(review): as a regex this matches a backslash followed by one or
	    # more '*'; a markdown "***" rule would be "\n\\*\\*\\*+\n" - confirm.
	    "\n\\\\\\*+\n",
	    "\n---+\n",  # horizontal rule made of dashes
	    "\n___+\n",  # horizontal rule made of underscores
	    "\n\n",  # paragraph break
	    "\n",  # line break
	    " ",  # word boundary
	    "",  # last resort: split between characters
	]

	class EndpointHandler:
	    """Retrieval-augmented question-answering handler.

	    Construction loads a fixed list of web pages, splits them into chunks,
	    indexes the chunks in an in-memory FAISS store and loads a 4-bit
	    quantized causal language model.  Calling the instance retrieves the
	    chunks most similar to the query and asks the model to answer from them.
	    """

	    def __init__(self, path=""):
	        """Build the vector store, the generation pipeline and the prompt.

	        Args:
	            path: Model identifier or directory handed to ``from_pretrained``
	                for both the reader model and its tokenizer.
	        """
	        # Load Vector db
	        urls = [
	            "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
	            "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
	            "https://www.cityu.edu.hk/media/press-release/2022/05/17/cityu-council-announces-appointment-professor-freddy-boey-next-president",
	            "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
	        ]
	        docs = WebBaseLoader(urls).load()

	        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
	            AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
	            chunk_size=512,
	            chunk_overlap=int(512 / 10),
	            add_start_index=True,
	            strip_whitespace=True,
	            separators=MARKDOWN_SEPARATORS,
	            # The separators are regular expressions; without this flag the
	            # splitter escapes them and looks for the literal pattern text.
	            is_separator_regex=True,
	        )
	        docs_processed = text_splitter.split_documents(docs)

	        # Remove duplicates, keeping the first chunk seen for each text.
	        seen_texts = set()
	        docs_processed_unique = []
	        for doc in docs_processed:
	            if doc.page_content not in seen_texts:
	                seen_texts.add(doc.page_content)
	                docs_processed_unique.append(doc)

	        # NOTE(review): multi_process=True presumably starts a worker pool for
	        # every embedding call, including each query at request time - confirm
	        # that this is intended for a serving endpoint.
	        embedding_model = HuggingFaceEmbeddings(
	            model_name=EMBEDDING_MODEL_NAME,
	            multi_process=True,
	            model_kwargs={"device": "cuda"},
	            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
	        )

	        self.vectorstore = FAISS.from_documents(
	            docs_processed_unique,
	            embedding_model,
	            distance_strategy=DistanceStrategy.COSINE,
	        )

	        # Create LLM
	        reader_model_name = path

	        bnb_config = BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_use_double_quant=True,
	            bnb_4bit_quant_type="nf4",
	            bnb_4bit_compute_dtype=torch.bfloat16,
	        )
	        model = AutoModelForCausalLM.from_pretrained(
	            reader_model_name, quantization_config=bnb_config
	        )
	        tokenizer = AutoTokenizer.from_pretrained(reader_model_name)

	        self.READER_LLM = pipeline(
	            model=model,
	            tokenizer=tokenizer,
	            task="text-generation",
	            do_sample=True,
	            temperature=0.2,
	            repetition_penalty=1.1,
	            return_full_text=False,
	            max_new_tokens=256,
	        )

	        # The continuation lines of the prompt strings below are part of the
	        # text sent to the model, so they are deliberately left unindented.
	        prompt_in_chat_format = [
	            {
	                "role": "system",
	                "content": """Using the information contained in the context.
	Respond only to the question asked, response should be concise and relevant to the question.
	If the answer cannot be deduced from the context, do not give an answer.""",
	            },
	            {
	                "role": "user",
	                "content": """Context: {context}
	Now here is the question you need to answer.
	Question: {question}""",
	            },
	        ]

	        # Rendered once; "{context}" and "{question}" are filled in per request.
	        self.RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
	            prompt_in_chat_format, tokenize=False, add_generation_prompt=True
	        )

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
	        """Answer a question using the indexed documents as context.

	        Args:
	            data: Request payload; the query string is read from its
	                ``"inputs"`` key.  The payload is not modified.

	        Returns:
	            The first element of the generation pipeline's output for the
	            assembled prompt.

	        Raises:
	            ValueError: If ``data`` holds no string under ``"inputs"``.
	        """
	        inputs = data.get("inputs")
	        if not isinstance(inputs, str):
	            # Fail early with a clear message instead of passing a non-string
	            # query down into the embedding code.
	            raise ValueError(
	                "Request payload must contain a string under the 'inputs' key"
	            )

	        retrieved_docs = self.vectorstore.similarity_search(query=inputs, k=2)

	        # Only the text of each document is needed.  Entries are joined with a
	        # newline so that one document's text does not run into the header of
	        # the next one.
	        context = "\nExtracted documents:\n" + "\n".join(
	            f"Document {i}:::\n{doc.page_content}"
	            for i, doc in enumerate(retrieved_docs)
	        )

	        final_prompt = self.RAG_PROMPT_TEMPLATE.format(
	            question=inputs, context=context
	        )

	        # Generate the answer and return the first candidate.
	        return self.READER_LLM(final_prompt)[0]