Spaces:

evegarcianz
/

Botlitica

Sleeping

App Files Files Community

Botlitica / app.py

evegarcianz

wording in italian

45d63b9 over 1 year ago

raw

history blame contribute delete

9.56 kB

	import pandas as pd
	import re
	import openai
	from openai.api_resources import engine
	import os
	import gradio as gr

	# The OpenAI API key is read from the 'ed_key' environment variable.
	# os.environ[...] raises KeyError at import time if the variable is not set.
	openai.api_key=os.environ['ed_key']
	#openai.api_key=""
	import pandas as pd
	import openai
	import numpy
	import numpy as np
	from transformers import GPT2TokenizerFast
	from numpy.linalg import norm
	# Module-level GPT-2 tokenizer; used below to count the tokens of each document section.
	tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")



	def _load_candidate_frame(csv_path):
	    """
	    Load one candidate's CSV of document sections.

	    The 'Unnamed: 0' column is renamed to 'number' and 'completion' to 'content';
	    the frame is then indexed by ('number', 'prompt') and gets a 'tokens' column
	    holding the GPT-2 token count of each 'content' value.

	    Return the prepared dataframe.
	    """
	    frame = pd.read_csv(csv_path)
	    frame.rename(columns={'Unnamed: 0': 'number', 'completion': 'content'}, inplace=True)
	    frame.set_index(['number', 'prompt'], inplace=True)
	    frame['tokens'] = frame['content'].apply(lambda text: len(tokenizer.tokenize(text)))
	    return frame


	# Meloni dataframe
	df = _load_candidate_frame('roboGiorgia_3docsCleaner.csv')

	# Calenda dataframe
	df_calenda = _load_candidate_frame('roboCalenda_3docsCleaner_84prompting.csv')

	# Letta dataframe
	df_letta = _load_candidate_frame('roboLetta_3source_cleaner_84prompting.csv')

	# Name of the completion model sent to the OpenAI Completion API.
	COMPLETIONS_MODEL = "text-davinci-003"

	# Keyword arguments unpacked into every openai.Completion.create call in this file.
	COMPLETIONS_API_PARAMS = dict(
	    temperature=1,
	    max_tokens=300,
	    model=COMPLETIONS_MODEL,
	)

	def get_embedding(text: str, model: str):
	    """
	    Request an embedding for `text` from the OpenAI Embeddings API using `model`.

	    Return the "embedding" entry of the first item in the response's "data" list.
	    """
	    response = openai.Embedding.create(input=text, model=model)
	    first_item = response["data"][0]
	    return first_item["embedding"]

	def get_doc_embedding(text):
	    """
	    Embed a document section by delegating to get_embedding.

	    The embedding model name is fixed here so callers only have to pass the text.

	    Return an embedding vector.
	    """
	    embedding_model = 'text-embedding-ada-002'
	    return get_embedding(text, embedding_model)



	def get_query_embedding(text):
	    """
	    Embed a user query by delegating to get_embedding.

	    The embedding model name is fixed here so callers only have to pass the text.

	    Return an embedding vector.
	    """
	    return get_embedding(text=text, model='text-embedding-ada-002')



	def compute_doc_embeddings(df: pd.DataFrame):
	    """
	    Embed the `content` of every row of the dataframe via get_doc_embedding.

	    Newlines in the content are replaced by spaces before embedding.

	    Return a dictionary mapping each row index to that row's embedding vector.
	    """
	    embeddings_by_index = {}
	    for row_index, row in df.iterrows():
	        flattened = row.content.replace("\n", " ")
	        embeddings_by_index[row_index] = get_doc_embedding(flattened)
	    return embeddings_by_index

	def vector_similarity(x, y):
	    """
	    Cosine similarity between two vectors.

	    x, y: equal-length sequences (or arrays) of numbers.

	    Return dot(x, y) / (norm(x) * norm(y)). When either vector has norm 0 the
	    cosine is undefined; 0.0 is returned in that case instead of dividing by zero
	    (which would yield nan and make the similarity ranking unreliable).
	    """
	    a = np.asarray(x)
	    b = np.asarray(y)
	    denominator = norm(a) * norm(b)
	    if denominator == 0:
	        return 0.0
	    return np.dot(a, b) / denominator

	def order_document_sections_by_query_similarity(query, contexts):
	    """
	    Score every pre-computed document embedding in `contexts` against the query.

	    The query is embedded with get_query_embedding and compared to each document
	    embedding with vector_similarity.

	    Return a list of (similarity, document index) tuples sorted in descending order.
	    """
	    query_vector = get_query_embedding(query)

	    scored_sections = []
	    for doc_index, doc_embedding in contexts.items():
	        score = vector_similarity(query_vector, doc_embedding)
	        scored_sections.append((score, doc_index))

	    scored_sections.sort(reverse=True)
	    return scored_sections

	def construct_prompt(pre,question, context_embeddings, df):
	    """
	    Build the completion prompt from the document sections most similar to the question.

	    Sections are ranked with order_document_sections_by_query_similarity and appended,
	    most similar first, until adding the next one would push the running token count
	    (section tokens plus separator tokens) above MAX_SECTION_LEN.

	    pre: instruction text placed at the very start of the prompt.
	    question: the user question; used for ranking and appended after the context.
	    context_embeddings: dictionary mapping section index to embedding vector.
	    df: dataframe indexed by section index, with `content` and `tokens` columns.

	    The prompt layout follows the OpenAI example notebooks:
	    https://beta.openai.com/docs/guides/fine-tuning/example-notebooks

	    Return the complete prompt and the long_context, which is the concatenation of
	    the chosen sections.
	    """
	    MAX_SECTION_LEN = 1650
	    SEPARATOR = "\n* "

	    # Reuse the module-level tokenizer instead of reloading GPT-2 on every call.
	    separator_len = len(tokenizer.tokenize(SEPARATOR))

	    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

	    chosen_sections = []
	    chosen_sections_len = 0

	    for _, section_index in most_relevant_document_sections:
	        # Add contexts until we run out of space.
	        document_section = df.loc[section_index]

	        chosen_sections_len += document_section.tokens + separator_len
	        if chosen_sections_len > MAX_SECTION_LEN:
	            break

	        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))

	    # Useful diagnostic information
	    print(f"Selected {len(chosen_sections)} document sections:")

	    long_context = "".join(chosen_sections)

	    # The original literal opened with four quote characters, which put a stray
	    # '"' between the instruction and the context header; it is removed here.
	    prompt = pre + "\n\nContext:\n" + long_context + "\n\n Q: " + question + "\n A:"
	    return prompt, long_context

	def answer_query_with_context(pre,query,df,document_embeddings, show_prompt= False):
	    """
	    Build the prompt with construct_prompt, send it to the OpenAI Completion API and
	    return the answer.

	    The completion parameters come from the module-level COMPLETIONS_API_PARAMS;
	    generation additionally stops at the first "." or " END".
	    When show_prompt is true the full prompt is printed before the request.

	    Return a tuple (long_context, answer) where the answer has leading and trailing
	    spaces and newlines stripped.
	    """
	    prompt, long_context = construct_prompt(pre, query, document_embeddings, df)

	    if show_prompt:
	        print(prompt)

	    completion = openai.Completion.create(
	        prompt=prompt,
	        stop=[".", " END"],
	        **COMPLETIONS_API_PARAMS
	    )
	    answer_text = completion["choices"][0]["text"]

	    return long_context, answer_text.strip(" \n")

	def embedding_storage_to_dict(path):
	    """
	    Read stored embeddings from a CSV file and rebuild the section -> vector dictionary.

	    The file must have an 'Unnamed: 0' column (dropped) plus 'section' and 'vector'
	    columns whose cells are Python literals stored as text.

	    Return a dictionary mapping each evaluated section to its evaluated vector.
	    """
	    def _parse_cell(cell):
	        # NOTE(review): eval() executes whatever expression the cell contains, so
	        # this is only safe for CSV files produced by a trusted source.
	        return eval(cell)

	    stored = pd.read_csv(path, engine="python")
	    stored = stored.drop('Unnamed: 0', axis=1)

	    sections = stored['section'].apply(_parse_cell)
	    vectors = stored['vector'].apply(_parse_cell)

	    return dict(zip(sections, vectors))


	# Pre-computed section embeddings for each candidate, loaded in the order
	# Calenda, Letta, Meloni.
	context_embeddings_calenda, context_embeddings_letta, context_embeddings = (
	    embedding_storage_to_dict(vectors_path)
	    for vectors_path in (
	        'Botlitica_Calenda_Vectors.csv',
	        'Botlitica_Letta_Vectors.csv',
	        'Botlitica_Meloni_Vectors.csv',
	    )
	)



	def greet(question,candidate):
	    """
	    Gradio click handler: answer `question` in the voice of the selected candidate.

	    question: text typed by the user.
	    candidate: one of 'Meloni', 'Calenda' or 'Letta'.

	    Return a tuple (answer, context used), in the order of the two output textboxes.

	    Raise gr.Error when the question is blank or when no known candidate is selected.
	    Previously an unselected dropdown left the per-candidate variables unbound and
	    the handler failed with UnboundLocalError.
	    """
	    # Per-candidate resources: (section embeddings, sections dataframe, instruction).
	    candidates = {
	        'Meloni': (context_embeddings, df,
	                   "Rispondi alla domanda come se fossi Giorgia Meloni."),
	        'Calenda': (context_embeddings_calenda, df_calenda,
	                    "Rispondi alla domanda come se fossi Carlo Calenda."),
	        'Letta': (context_embeddings_letta, df_letta,
	                  "Rispondi alla domanda come se fossi Enrico Letta."),
	    }

	    if not question or not question.strip():
	        raise gr.Error("Scrivi una domanda.")
	    if candidate not in candidates:
	        raise gr.Error("Seleziona un candidato: Meloni, Calenda o Letta.")

	    context_embeddings_selected, df_selected, pre = candidates[candidate]

	    contexto, respuesta = answer_query_with_context(
	        pre, question, df_selected, context_embeddings_selected, show_prompt=True
	    )

	    return respuesta, contexto





	# Custom stylesheet passed to gr.Blocks(css=css): gives the Gradio container a
	# diagonal red / white / green gradient background.
	css= """.gradio-container {
	background: linear-gradient(-45deg,
	#FF0000, #FFFFFF, #228B22);


	}


	"""


	#{background: linear-gradient(to right, #228B22, #FFFFFF, #FF0000);}


	# Build the Gradio interface: a Markdown header, a question textbox, a candidate
	# dropdown, a submit button and two output textboxes (answer and context used).
	# NOTE(review): the indentation of this block is missing in this copy of the file;
	# the nesting of the statements under the `with` blocks must be restored before
	# the script can run.
	with gr.Blocks(css=css) as demo:
	with gr.Row():
	#gr.Markdown(value="![](https://upload.wikimedia.org/wikipedia/commons/9/94/AXA_Logo.svg)", elem_id='imagen')
	gr.Markdown(
	"""
	# Botlitica!
	Botlitica è una AI conversazionale addestrata per rispondere alle vostre domande rispecchiando la propaganda politica sui social media (Twitter e Facebook) pre-elezioni condotta dai premiers di tre partiti:
	""")


	question = gr.Textbox(label="Domanda")

	with gr.Row():
	candidate= gr.Dropdown(
	["Meloni", "Calenda", "Letta"], label="Candidato")

	greet_btn = gr.Button("Chiedere")

	# The order of these outputs matches greet's return value: (answer, context).
	output=[gr.Textbox(lines=3, label='Generative AI risposta'), gr.Textbox(lines=3, label='Contesto utilizzato') ]
	#greet_btn = gr.Button("Submit")
	# Clicking the button calls greet(question, candidate); the handler is also
	# registered under the API name "greet".
	greet_btn.click(fn=greet, inputs=[question,candidate,], outputs=output, api_name="greet")
	# gr.Markdown(
	# """
	# Was this answer useful?
	# """)
	# with gr.Row():
	# feed_btn = gr.Button("Yes :)")
	# feed_btn_neg = gr.Button("No :(")





	# Start the Gradio server for the app defined above.
	demo.launch()