ArticleChatbot

Sleeping

App Files Files Community

ArticleChatbot / streamlit_app.py

lfoppiano

add more embedding functions

8520312 5 months ago

raw

history blame

19.6 kB

	import os
	import re
	from hashlib import blake2b
	from tempfile import NamedTemporaryFile

	import dotenv
	from grobid_quantities.quantities import QuantitiesAPI
	from langchain.memory import ConversationBufferWindowMemory
	from langchain_community.chat_models.openai import ChatOpenAI
	from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
	from langchain_community.embeddings.openai import OpenAIEmbeddings
	from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
	from streamlit_pdf_viewer import pdf_viewer

	from document_qa.ner_client_generic import NERClientGeneric

	dotenv.load_dotenv(override=True)

	import streamlit as st
	from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
	from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

	# Chat models served through the OpenAI API.
	OPENAI_MODELS = [
	    'gpt-3.5-turbo',
	    "gpt-4",
	    "gpt-4-1106-preview",
	]

	# Embedding models offered when an OpenAI chat model is selected.
	# The entries are passed verbatim as `model=` to OpenAIEmbeddings, so they must
	# be the exact OpenAI model identifiers (no extra "openai-" prefix).
	# The first entry is the default shown in the "Embeddings" select box.
	OPENAI_EMBEDDINGS = [
	    'text-embedding-ada-002',
	    'text-embedding-3-large',
	    'text-embedding-3-small',
	]

	# Display name -> Hugging Face repository id for the open chat models.
	# The display name is also what the DEFAULT_MODEL environment variable refers to,
	# so the keys are kept stable; the repository id must match the advertised version.
	OPEN_MODELS = {
	    'mistral-7b-instruct-v0.3': 'mistralai/Mistral-7B-Instruct-v0.3',
	    # 'Phi-3-mini-128k-instruct': "microsoft/Phi-3-mini-128k-instruct",
	    'Phi-3-mini-4k-instruct': "microsoft/Phi-3-mini-4k-instruct",
	}

	# Display name -> model name handed to HuggingFaceEmbeddings for the open models.
	DEFAULT_OPEN_EMBEDDING_NAME = 'Default (all-MiniLM-L6-v2)'
	OPEN_EMBEDDINGS = {
	    DEFAULT_OPEN_EMBEDDING_NAME: 'all-MiniLM-L6-v2',
	    'SFR-Embedding-Mistral': 'Salesforce/SFR-Embedding-Mistral',
	    'SFR-Embedding-2_R': 'Salesforce/SFR-Embedding-2_R',
	    'NV-Embed': 'nvidia/NV-Embed-v1',
	    'e5-mistral-7b-instruct': 'intfloat/e5-mistral-7b-instruct',
	}

	# Seed st.session_state the first time a session runs the script.
	# Keys that already exist are left alone so values survive Streamlit reruns.
	_SESSION_DEFAULTS = {
	    'rqa': {},  # model name -> engine returned by init_qa
	    'model': None,
	    'api_keys': {},  # model name -> API key in use for that model
	    'doc_id': None,
	    'loaded_embeddings': None,
	    'hash': None,
	    'messages': [],  # chat history, replayed on every rerun
	    'ner_processing': False,
	    'uploaded': False,
	    'memory': None,
	    'binary': None,  # raw bytes of the uploaded document
	    'annotations': None,
	    'should_show_annotations': True,
	    'pdf': None,
	    'embeddings': None,
	}
	for _state_key, _state_default in _SESSION_DEFAULTS.items():
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _state_default

	if 'git_rev' not in st.session_state:
	    st.session_state['git_rev'] = "unknown"
	    if os.path.exists("revision.txt"):
	        with open("revision.txt", 'r') as fr:
	            # Strip surrounding whitespace: the value is later embedded in a
	            # markdown link/URL, where a trailing newline would break the link.
	            from_file = fr.read().strip()
	        st.session_state['git_rev'] = from_file if from_file else "unknown"

	# Entries for the page menu (help / bug report / about).
	_page_menu_items = {
	    'Get Help': 'https://github.com/lfoppiano/document-qa',
	    'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
	    'About': "Upload a scientific article in PDF, ask questions, get insights.",
	}

	# Page-level configuration: title, icon, wide layout and an expanded sidebar.
	st.set_page_config(
	    layout="wide",
	    initial_sidebar_state="expanded",
	    page_icon="📝",
	    page_title="Scientific Document Insights Q/A",
	    menu_items=_page_menu_items,
	)


	def new_file():
	    """Reset the per-document session state when the uploaded file changes.

	    Used as the `on_change` callback of the file uploader: forgets the loaded
	    embeddings and document id, marks that an upload happened, and clears the
	    conversational memory when one exists.
	    """
	    state = st.session_state
	    state['loaded_embeddings'] = None
	    state['doc_id'] = None
	    state['uploaded'] = True

	    memory = state['memory']
	    if memory:
	        memory.clear()


	def clear_memory():
	    """Clear the conversational memory stored in the session, if there is one.

	    `st.session_state['memory']` starts as None and is only populated by
	    `init_qa` for some models, so the call is guarded the same way `new_file`
	    guards it instead of failing with AttributeError on None.
	    """
	    memory = st.session_state['memory']
	    if memory:
	        memory.clear()


	# @st.cache_resource
	def init_qa(model, embeddings_name=None, api_key=None):
	    """Build the question-answering engine for the selected chat model.

	    Args:
	        model: Name of the chat model; must be in OPENAI_MODELS or a key of
	            OPEN_MODELS.
	        embeddings_name: Name of the embedding function. Defaults to
	            'text-embedding-ada-002' for OpenAI models and to
	            DEFAULT_OPEN_EMBEDDING_NAME for open models.
	        api_key: Optional API key. When omitted for an OpenAI model no key is
	            passed to the clients (the caller omits it when OPENAI_API_KEY is
	            set in the environment). For open models the key stored by the
	            sidebar in st.session_state['api_keys'] is used as a fallback.

	    Returns:
	        A DocumentQAEngine wired to the chat model, a DataStorage built on the
	        chosen embeddings, the Grobid server from GROBID_URL and the session
	        memory.

	    On an unknown model or an unsupported embedding name an error is shown and
	    the script run is halted with st.stop(), so nothing is returned.
	    """
	    ## For debug add: callbacks=[PromptLayerCallbackHandler(pl_tags=["langchain", "chatgpt", "document-qa"])])
	    if model in OPENAI_MODELS:
	        if embeddings_name is None:
	            embeddings_name = 'text-embedding-ada-002'

	        st.session_state['memory'] = ConversationBufferWindowMemory(k=4)

	        # Validate the embedding name whatever the source of the API key is
	        # (previously this was only checked when a key was passed explicitly).
	        if embeddings_name not in OPENAI_EMBEDDINGS:
	            st.error(f"The embeddings provided {embeddings_name} are not supported by this model {model}.")
	            st.stop()

	        # Pass the key explicitly only when one was supplied.
	        credentials = {'openai_api_key': api_key} if api_key else {}
	        chat = ChatOpenAI(model_name=model,
	                          temperature=0,
	                          frequency_penalty=0.1,
	                          **credentials)
	        embeddings = OpenAIEmbeddings(model=embeddings_name, **credentials)

	    elif model in OPEN_MODELS:
	        if embeddings_name is None:
	            embeddings_name = DEFAULT_OPEN_EMBEDDING_NAME

	        # Report an unknown embedding name to the user instead of a bare KeyError.
	        if embeddings_name not in OPEN_EMBEDDINGS:
	            st.error(f"The embeddings provided {embeddings_name} are not supported by this model {model}.")
	            st.stop()

	        # A key typed in the sidebar is stored in the session but is not in the
	        # environment, so hand it to the endpoint explicitly; otherwise it
	        # would never reach the Hugging Face client.
	        token = api_key or st.session_state.get('api_keys', {}).get(model)
	        endpoint_credentials = {'huggingfacehub_api_token': token} if token else {}

	        chat = HuggingFaceEndpoint(
	            repo_id=OPEN_MODELS[model],
	            temperature=0.01,
	            max_new_tokens=2048,
	            model_kwargs={"max_length": 4096},
	            **endpoint_credentials
	        )
	        embeddings = HuggingFaceEmbeddings(
	            model_name=OPEN_EMBEDDINGS[embeddings_name])
	        # st.session_state['memory'] = ConversationBufferWindowMemory(k=4) if model not in DISABLE_MEMORY else None
	    else:
	        st.error("The model was not loaded properly. Try reloading. ")
	        st.stop()

	    storage = DataStorage(embeddings)
	    return DocumentQAEngine(chat, storage, grobid_url=os.environ['GROBID_URL'], memory=st.session_state['memory'])


	@st.cache_resource
	def init_ner():
	    """Create the processor used to annotate quantities and materials in text.

	    Builds a grobid-quantities client (GROBID_QUANTITIES_URL) and a generic NER
	    client configured for the materials service (GROBID_MATERIALS_URL), and
	    combines them in a GrobidAggregationProcessor. Cached by Streamlit through
	    `st.cache_resource`.
	    """
	    quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
	    materials_client = NERClientGeneric(ping=True)

	    grobid_settings = {
	        "server": os.environ['GROBID_MATERIALS_URL'],
	        'sleep_time': 5,
	        'timeout': 60,
	        'url_mapping': {
	            'processText_disable_linking': "/service/process/text?disableLinking=True",
	            # 'processText_disable_linking': "/service/process/text"
	        },
	    }
	    materials_client.set_config({'grobid': grobid_settings})

	    return GrobidAggregationProcessor(
	        grobid_quantities_client=quantities_client,
	        grobid_superconductors_client=materials_client,
	    )


	# Module-level processor shared by the rest of the script.
	gqa = init_ner()


	def get_file_hash(fname):
	    """Return the hexadecimal BLAKE2b digest of the file at `fname`.

	    The file is read in 4096-byte blocks so it never has to fit in memory.
	    """
	    digest = blake2b()
	    with open(fname, "rb") as stream:
	        while True:
	            block = stream.read(4096)
	            if not block:
	                break
	            digest.update(block)
	    return digest.hexdigest()


	def play_old_messages(container):
	    """Re-render the stored chat history inside `container`.

	    Args:
	        container: Streamlit container exposing `chat_message`; one chat
	            bubble is added per stored message.

	    Each message is rendered according to the mode it was produced with
	    (stored under its 'mode' key), not the query mode currently selected:
	    "llm" and "question_coefficient" content is rendered as markdown with HTML
	    enabled (LLM answers may carry the inline styling added by the NER
	    post-processing), everything else goes through `write`.
	    """
	    for message in st.session_state['messages']:
	        role = message['role']
	        if role == 'user':
	            container.chat_message("user").markdown(message['content'])
	        elif role == 'assistant':
	            # Modes are stored lowercase ("llm", "embeddings", "question_coefficient").
	            if message.get('mode') in ("llm", "question_coefficient"):
	                container.chat_message("assistant").markdown(message['content'], unsafe_allow_html=True)
	            else:
	                container.chat_message("assistant").write(message['content'])


	# is_api_key_provided = st.session_state['api_key']

	# Sidebar, part 1: title, model and embedding selection, API key handling.
	with st.sidebar:
	    st.title("📝 Scientific Document Insights Q/A")
	    st.subheader("Upload a scientific article in PDF, ask questions, get insights.")
	    st.markdown(
	        ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")

	    st.divider()

	    # Model and embeddings cannot be changed once a document has been uploaded.
	    selection_locked = st.session_state['doc_id'] is not None or st.session_state['uploaded']

	    model_options = OPENAI_MODELS + list(OPEN_MODELS.keys())
	    # Preselect DEFAULT_MODEL when it names a known model; an unset, empty or
	    # unknown value falls back to the first entry instead of raising ValueError.
	    default_model = os.environ.get("DEFAULT_MODEL")
	    default_model_index = model_options.index(default_model) if default_model in model_options else 0

	    st.session_state['model'] = model = st.selectbox(
	        "Model:",
	        options=model_options,
	        index=default_model_index,
	        placeholder="Select model",
	        help="Select the LLM model:",
	        disabled=selection_locked
	    )
	    embedding_choices = OPENAI_EMBEDDINGS if model in OPENAI_MODELS else OPEN_EMBEDDINGS

	    st.session_state['embeddings'] = embedding_name = st.selectbox(
	        "Embeddings:",
	        options=embedding_choices,
	        index=0,
	        placeholder="Select embedding",
	        help="Select the Embedding function:",
	        disabled=selection_locked
	    )

	    # The engine for a model is created once, the first time a key is available
	    # for it; afterwards the model is listed in st.session_state['api_keys'].
	    if (model in OPEN_MODELS) and model not in st.session_state['api_keys']:
	        if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
	            api_key = st.text_input('Huggingface API Key', type="password")

	            st.markdown("Get it [here](https://huggingface.co/docs/hub/security-tokens)")
	        else:
	            api_key = os.environ['HUGGINGFACEHUB_API_TOKEN']

	        if api_key:
	            # st.session_state['api_key'] = is_api_key_provided = True
	            with st.spinner("Preparing environment"):
	                st.session_state['api_keys'][model] = api_key
	                # if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
	                # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
	                st.session_state['rqa'][model] = init_qa(model, embedding_name)

	    elif model in OPENAI_MODELS and model not in st.session_state['api_keys']:
	        if 'OPENAI_API_KEY' not in os.environ:
	            api_key = st.text_input('OpenAI API Key', type="password")
	            st.markdown("Get it [here](https://platform.openai.com/account/api-keys)")
	        else:
	            api_key = os.environ['OPENAI_API_KEY']

	        if api_key:
	            with st.spinner("Preparing environment"):
	                st.session_state['api_keys'][model] = api_key
	                # Hand the key over explicitly only when the user typed it in.
	                if 'OPENAI_API_KEY' not in os.environ:
	                    st.session_state['rqa'][model] = init_qa(model, st.session_state['embeddings'], api_key)
	                else:
	                    st.session_state['rqa'][model] = init_qa(model, st.session_state['embeddings'])
	    # else:
	    # is_api_key_provided = st.session_state['api_key']

	    # st.button(
	    # 'Reset chat memory.',
	    # key="reset-memory-button",
	    # on_click=clear_memory,
	    # help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.",
	    # disabled=model in st.session_state['rqa'] and st.session_state['rqa'][model].memory is None)

	# Main page layout: two columns of equal weight, each replaced by a bordered
	# container. The right one hosts the upload and chat widgets below.
	left_column, right_column = st.columns([1, 1])
	right_column = right_column.container(border=True)
	left_column = left_column.container(border=True)

	# NOTE(review): indentation was lost in this copy of the file; the statements
	# below are kept exactly as found. Confirm against the original which of them
	# (in particular the chat input) belong inside the `with right_column:` body.
	with right_column:
	# Document upload. `new_file` resets the per-document state on every change;
	# the widget is disabled while the selected model has no API key registered.
	uploaded_file = st.file_uploader(
	"Upload an article",
	type=("pdf", "txt"),
	on_change=new_file,
	disabled=st.session_state['model'] is not None and st.session_state['model'] not in
	st.session_state['api_keys'],
	help="The full-text is extracted using Grobid."
	)

	# `placeholder` is used later to show progress spinners; `messages` is the
	# fixed-height container the chat history is rendered into.
	placeholder = st.empty()
	messages = st.container(height=300)

	# Question box; only enabled once a file has been uploaded.
	question = st.chat_input(
	"Ask something about the article",
	# placeholder="Can you give me a short summary?",
	disabled=not uploaded_file
	)

	# Internal query mode identifier -> label shown by the "Query mode" radio.
	query_modes = {
	"llm": "LLM Q/A",
	"embeddings": "Embeddings",
	"question_coefficient": "Question coefficient"
	}

	# Sidebar, part 2: query settings, chunking/context sliders and documentation.
	with st.sidebar:
	    st.header("Settings")

	    # Query-related controls stay disabled until a document has been uploaded.
	    no_document = not uploaded_file

	    mode = st.radio(
	        "Query mode",
	        ("llm", "embeddings", "question_coefficient"),
	        disabled=no_document,
	        index=0,
	        horizontal=True,
	        format_func=lambda mode_id: query_modes[mode_id],
	        help="LLM will respond the question, Embedding will show the "
	             "relevant paragraphs to the question in the paper. "
	             "Question coefficient attempt to estimate how effective the question will be answered."
	    )
	    st.session_state['ner_processing'] = st.checkbox(
	        "Identify materials and properties.",
	        help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
	    )

	    # Add a checkbox for showing annotations
	    # st.session_state['show_annotations'] = st.checkbox("Show annotations", value=True)
	    # st.session_state['should_show_annotations'] = st.checkbox("Show annotations", value=True)

	    # The chunk size can only be chosen before a file is uploaded.
	    chunk_size = st.slider(
	        "Text chunks size",
	        min_value=-1,
	        max_value=2000,
	        value=-1,
	        help="Size of chunks in which split the document. -1: use paragraphs, > 0 paragraphs are aggregated.",
	        disabled=uploaded_file is not None
	    )

	    # The context size is counted in paragraphs when no chunking is requested
	    # (-1), in chunks otherwise; range and default differ accordingly.
	    if chunk_size == -1:
	        context_size = st.slider(
	            "Context size (paragraphs)",
	            min_value=3,
	            max_value=20,
	            value=10,
	            help="Number of paragraphs to consider when answering a question",
	            disabled=no_document
	        )
	    else:
	        context_size = st.slider(
	            "Context size (chunks)",
	            min_value=3,
	            max_value=10,
	            value=4,
	            help="Number of chunks to consider when answering a question",
	            disabled=no_document
	        )

	    st.divider()

	    st.header("Documentation")
	    st.markdown("https://github.com/lfoppiano/document-qa")
	    st.markdown(
	        """Upload a scientific article as PDF document. Once the spinner stops, you can proceed to ask your questions.""")

	    # Link to the deployed commit when the revision is known.
	    git_rev = st.session_state['git_rev']
	    if git_rev != "unknown":
	        st.markdown(f"Revision number: [{git_rev}](https://github.com/lfoppiano/document-qa/commit/{git_rev})")

	# A file was uploaded and has not been indexed yet: extract its text and build
	# the embeddings once, then remember the resulting document id in the session.
	if uploaded_file and not st.session_state.loaded_embeddings:
	    if model not in st.session_state['api_keys']:
	        st.error("Before uploading a document, you must enter the API key. ")
	        st.stop()

	    with left_column:
	        with st.spinner('Reading file, calling Grobid, and creating memory embeddings...'):
	            binary = uploaded_file.getvalue()
	            st.session_state['binary'] = binary

	            # The engine receives a file path, so the bytes go through a
	            # temporary file. flush() makes sure they are on disk before the
	            # path is handed over; the `with` block closes (and removes) the
	            # file afterwards instead of leaking the handle.
	            # NOTE(review): assumes create_memory_embeddings finishes reading
	            # the file before it returns - confirm against the engine.
	            with NamedTemporaryFile() as tmp_file:
	                tmp_file.write(binary)
	                tmp_file.flush()

	                st.session_state['doc_id'] = st.session_state['rqa'][model].create_memory_embeddings(
	                    tmp_file.name,
	                    chunk_size=chunk_size,
	                    perc_overlap=0.1)

	            st.session_state['loaded_embeddings'] = True
	            # A new document starts with an empty chat history.
	            st.session_state.messages = []


	def rgb_to_hex(rgb):
	    """Format an (r, g, b) sequence of integers as a '#rrggbb' hex colour string."""
	    hex_template = "#{0:02x}{1:02x}{2:02x}"
	    return hex_template.format(*rgb)


	def generate_color_gradient(num_elements):
	    """Return `num_elements` hex colours fading linearly from orange towards blue.

	    Element `i` blends the two colours with weight `i / num_elements`, so the
	    first colour is pure orange and the last one stops one step short of pure
	    blue. Each channel is truncated to an integer before formatting.
	    """
	    warm_color = (255, 165, 0)  # Orange
	    cold_color = (0, 0, 255)  # Blue

	    color_gradient = []
	    for step in range(num_elements):
	        blended = tuple(
	            int(warm * (1 - step / num_elements) + cold * (step / num_elements))
	            for warm, cold in zip(warm_color, cold_color)
	        )
	        color_gradient.append(rgb_to_hex(blended))

	    return color_gradient


	# Right panel: answer a newly submitted question, or just replay the history.
	with right_column:
	    if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
	        # Replay the conversation so far. Exactly one chat bubble is created
	        # per message (the previous version opened an extra, empty bubble for
	        # each one), rendered according to the mode the message was made with.
	        for message in st.session_state.messages:
	            bubble = messages.chat_message(message["role"])
	            if message['mode'] in ("llm", "question_coefficient"):
	                bubble.markdown(message["content"], unsafe_allow_html=True)
	            elif message['mode'] == "embeddings":
	                bubble.write(message["content"])

	        if model not in st.session_state['rqa']:
	            st.error("The API Key for the " + model + " is missing. Please add it before sending any query.")
	            st.stop()

	        messages.chat_message("user").markdown(question)
	        st.session_state.messages.append({"role": "user", "mode": mode, "content": question})

	        engine = st.session_state['rqa'][model]
	        text_response = None
	        coordinates = []
	        if mode == "embeddings":
	            with placeholder:
	                with st.spinner("Fetching the relevant context..."):
	                    text_response, coordinates = engine.query_storage(
	                        question,
	                        st.session_state.doc_id,
	                        context_size=context_size
	                    )
	        elif mode == "llm":
	            with placeholder:
	                with st.spinner("Generating LLM response..."):
	                    _, text_response, coordinates = engine.query_document(
	                        question,
	                        st.session_state.doc_id,
	                        context_size=context_size
	                    )
	        elif mode == "question_coefficient":
	            # Spinner shown in the placeholder, like the other two modes.
	            with placeholder:
	                with st.spinner("Estimate question/context relevancy..."):
	                    text_response, coordinates = engine.analyse_query(
	                        question,
	                        st.session_state.doc_id,
	                        context_size=context_size
	                    )

	        # One list of boxes per retrieved passage; every passage gets its own
	        # colour from the gradient, then the boxes are flattened for the viewer.
	        # Each coordinate string is split on "," before being converted.
	        annotations = [
	            [GrobidAggregationProcessor.box_to_dict(c.split(",")) for c in coord_doc]
	            for coord_doc in coordinates or []
	        ]
	        gradients = generate_color_gradient(len(annotations))
	        for annotation_doc, color in zip(annotations, gradients):
	            for annotation in annotation_doc:
	                annotation['color'] = color
	        st.session_state['annotations'] = [
	            annotation for annotation_doc in annotations for annotation in annotation_doc
	        ]

	        if not text_response:
	            # Nothing to show: report the failure and neither post-process nor
	            # store the empty answer (it would break NER and later replays).
	            st.error("Something went wrong. Please report the issue at "
	                     "https://github.com/lfoppiano/document-qa/issues.")
	        else:
	            if mode == "llm":
	                if st.session_state['ner_processing']:
	                    with st.spinner("Processing NER on LLM response..."):
	                        entities = gqa.process_single_text(text_response)
	                        decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
	                        # Swap the CSS classes for inline colours: materials in
	                        # green, every other label in orange.
	                        decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
	                        decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
	                        text_response = decorated_text
	                messages.chat_message("assistant").markdown(text_response, unsafe_allow_html=True)
	            else:
	                messages.chat_message("assistant").write(text_response)
	            st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})

	    elif st.session_state.loaded_embeddings and st.session_state.doc_id:
	        # No new question on this rerun: just show the existing conversation.
	        play_old_messages(messages)

	# Left panel: show the uploaded document with the annotations of the last answer.
	with left_column:
	    if st.session_state['binary']:
	        pdf_viewer(
	            input=st.session_state['binary'],
	            annotation_outline_size=2,
	            # 'annotations' is None until the first question has been answered;
	            # pass an empty list in that case rather than None.
	            # NOTE(review): assumes pdf_viewer expects a list of annotation
	            # dicts here - confirm against the component documentation.
	            annotations=st.session_state['annotations'] or [],
	            render_text=True,
	            height=600
	        )