Spaces:

kzyd
/

Thoughtful-AI-Screening

Sleeping

Kazuki Yoda

Implement the logic to get predefined answer

f5b8cbf about 1 month ago

3.27 kB

	"""This entire file was solely written by the applicant, Kazuki Yoda."""

	import json
	from typing import Optional

	# # For Debugging only
	# from scipy.spatial import distance_matrix
	# from sklearn.metrics.pairwise import cosine_similarity

	from huggingface_hub import InferenceClient

	zero_shot_classification_client = InferenceClient("facebook/bart-large-mnli")


	def load_predefined_questions_to_answers_as_dict(
	    path="predefined.json",
	) -> dict[str, str]:
	    """Load the predefined question-answer pairs from a JSON file.

	    The file must hold an object with a ``questions`` key. Its value is
	    iterated, and each item is queried for a ``question`` and an ``answer``
	    entry.

	    Args:
	        path: Location of the JSON file to read.

	    Returns:
	        A dict mapping each question (str) to its answer (str). Items whose
	        question or answer is missing or empty are skipped.

	    Raises:
	        ValueError: If the top-level ``questions`` key is missing.
	    """
	    # JSON text is UTF-8; do not depend on the platform's default encoding.
	    with open(path, encoding="utf-8") as file:
	        data = json.load(file)

	    if "questions" not in data:
	        raise ValueError("`questions` key is expected but missing.")

	    question_to_answer = {}
	    for item in data["questions"]:
	        question = item.get("question")
	        answer = item.get("answer")

	        # Skip items whose question or answer is missing or empty.
	        if question and answer:
	            question_to_answer[question] = answer

	    return question_to_answer


	def get_embeddings(texts: list[str]):
	    """Return one feature-extraction result per input text, in input order.

	    Each text is sent on its own to the ``efederici/sentence-bert-base``
	    model through a freshly created ``InferenceClient``.
	    """
	    embedding_client = InferenceClient("efederici/sentence-bert-base")

	    vectors = []
	    for text in texts:
	        vectors.append(embedding_client.feature_extraction(text))
	    return vectors


	def get_predefined_answer_for_closest_predefined_question(
	    question: str,
	    cutoff=0.5,  # Minimum classification score to use the predefined answer
	) -> Optional[str]:
	    """Return the predefined answer whose question best matches ``question``.

	    The predefined questions are used as candidate labels for a zero-shot
	    classification of ``question``. The answer of the highest-scoring label
	    is returned when its score is strictly greater than ``cutoff``.

	    Args:
	        question: The user's question to classify.
	        cutoff: Minimum classification score (exclusive) required to use
	            the predefined answer.

	    Returns:
	        The predefined answer for the best-matching question, or ``None``
	        when no label scores above ``cutoff`` or there are no predefined
	        questions. ``None`` tells the caller to fall back to the normal
	        LLM response.
	    """
	    question_to_answer = load_predefined_questions_to_answers_as_dict()
	    labels = list(question_to_answer)

	    # With no candidate labels there is nothing to classify against; skip
	    # the remote call and fall back instead of failing further down.
	    if not labels:
	        return None

	    zero_shot_classification_result = (
	        zero_shot_classification_client.zero_shot_classification(
	            text=question,
	            labels=labels,
	            multi_label=True,
	        )
	    )

	    # ``default=None`` keeps an empty result from raising ValueError.
	    max_score_result = max(
	        zero_shot_classification_result,
	        key=lambda x: x.score,
	        default=None,
	    )

	    if max_score_result is not None and max_score_result.score > cutoff:
	        closest_predefined_question = max_score_result.label
	        return question_to_answer[closest_predefined_question]

	    # Switch back to the normal LLM response
	    return None


	if __name__ == "__main__":
	    # Run some print debugs. Not executed from the Gradio app.

	    question_to_answer = load_predefined_questions_to_answers_as_dict()
	    print(question_to_answer)

	    additional_questions = [
	        "What does EVA do?",
	        "How does PHIL work?",
	        "Thoughtful AI",
	        ### Irrelevant but confusing questions ###
	        "Who is the CEO of Thoughtful AI?",
	        "How much does Thoughtful AI pay for its ML engineers?",
	        "What's Evangelion (EVA)?",
	    ]
	    predefined_questions = list(question_to_answer)
	    questions = predefined_questions + additional_questions

	    embeddings = get_embeddings(questions)

	    for embedding in embeddings:
	        print(embedding.shape)

	    # For DEBUG, check the embeddings
	    # print(distance_matrix(embeddings, embeddings[:len(predefined_questions)]))
	    # print(cosine_similarity(embeddings, embeddings[:len(predefined_questions)]))

	    for question in questions:
	        # The helper returns the predefined *answer* (or None when no
	        # predefined question scores above the cutoff), not the question.
	        predefined_answer = get_predefined_answer_for_closest_predefined_question(question)
	        print(f"question: {question}")
	        print(f"predefined_answer: {predefined_answer}")
	        print()