import os
import re

import pandas as pd
import numpy as np
import openai
import gradio as gr
from transformers import GPT2TokenizerFast
from numpy.linalg import norm

# Read the API key from the environment rather than hardcoding a secret in the source.
openai.api_key = os.environ['ed_key']

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Meloni dataframe
df = pd.read_csv('roboGiorgia_3docsCleaner.csv')
df.rename(columns={'Unnamed: 0': 'number', 'prompt': 'prompt', 'completion': 'content'}, inplace=True)
df.set_index(['number', 'prompt'], inplace=True)
df['tokens'] = df['content'].apply(lambda x: len(tokenizer.tokenize(x)))

# Calenda dataframe
df_calenda = pd.read_csv('roboCalenda_3docsCleaner_84prompting.csv')
df_calenda.rename(columns={'Unnamed: 0': 'number', 'prompt': 'prompt', 'completion': 'content'}, inplace=True)
df_calenda.set_index(['number', 'prompt'], inplace=True)
df_calenda['tokens'] = df_calenda['content'].apply(lambda x: len(tokenizer.tokenize(x)))

# Letta dataframe
df_letta = pd.read_csv('roboLetta_3source_cleaner_84prompting.csv')
df_letta.rename(columns={'Unnamed: 0': 'number', 'prompt': 'prompt', 'completion': 'content'}, inplace=True)
df_letta.set_index(['number', 'prompt'], inplace=True)
df_letta['tokens'] = df_letta['content'].apply(lambda x: len(tokenizer.tokenize(x)))
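# Sketch of the CSV layout the three loaders above assume (column names inferred
# from the rename calls; the row values here are hypothetical):
#   ,prompt,completion
#   0,"economia","Testo del post sull'economia ..."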
COMPLETIONS_MODEL = "text-davinci-003"

COMPLETIONS_API_PARAMS = {
    "temperature": 1,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
def get_embedding(text: str, model: str):
    """
    Create an embedding for the given string using the OpenAI Embeddings API and the chosen model.
    Return the embedding vector.
    """
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    return result["data"][0]["embedding"]
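# Illustrative usage (not executed here; text-embedding-ada-002 returns
# 1536-dimensional vectors):
#   vec = get_embedding("ciao", "text-embedding-ada-002")
#   print(len(vec))  # 1536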
def get_doc_embedding(text):
    """
    Wrapper around get_embedding for document sections; the model is hardcoded for
    simplicity when these functions are imported from this file.
    Return the embedding vector.
    """
    return get_embedding(text, 'text-embedding-ada-002')

def get_query_embedding(text):
    """
    Wrapper around get_embedding for user queries; the model is hardcoded for
    simplicity when these functions are imported from this file.
    Return the embedding vector.
    """
    return get_embedding(text, 'text-embedding-ada-002')
def compute_doc_embeddings(df: pd.DataFrame):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    Return a dictionary that maps the index of each row to its embedding vector.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()
    }
def vector_similarity(x, y):
    """
    Cosine similarity between two vectors.
    """
    return np.dot(np.array(x), np.array(y)) / (norm(x) * norm(y))
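# Worked example with assumed inputs: for x = [1, 0] and y = [1, 1],
# dot(x, y) = 1, norm(x) = 1, norm(y) = sqrt(2), so
# vector_similarity([1, 0], [1, 1]) = 1 / sqrt(2) ≈ 0.7071.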
def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed the query and compare it against all of the pre-computed document embeddings
    to find the most relevant sections.
    Return a list of (similarity, section index) pairs, sorted by relevance in descending order.
    """
    query_embedding = get_query_embedding(query)
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    return document_similarities
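# Example return shape (similarity scores hypothetical; the index is the
# (number, prompt) MultiIndex set on the dataframes above):
#   [(0.91, (12, 'economia')), (0.87, (3, 'immigrazione')), ...]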
def construct_prompt(pre, question, context_embeddings, df):
    """
    Construct a prompt from the most relevant document sections, as ranked by
    order_document_sections_by_query_similarity.
    MAX_SECTION_LEN (in tokens) is key because it controls how many sections are
    concatenated into the prompt: the function stops joining sections once
    MAX_SECTION_LEN is reached.
    The pre argument is also key because it instructs the model how to answer.
    This pattern is based on the OpenAI documentation:
    https://beta.openai.com/docs/guides/fine-tuning/example-notebooks
    Return the complete prompt and long_context, the concatenation of the chosen
    most relevant sections.
    """
    MAX_SECTION_LEN = 1650
    SEPARATOR = "\n* "
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    separator_len = len(tokenizer.tokenize(SEPARATOR))
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
    for simi, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(simi) + ' ' + str(section_index))
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    preprompt = pre + "\n\nContext:\n"
    prompt = preprompt + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"
    long_context = "".join(chosen_sections)
    return prompt, long_context
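# Sketch of the assembled prompt (section text and question are hypothetical):
#   <pre>\n\nContext:\n* <most relevant section>\n* <next section>...\n\n Q: <question>\n A: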
def answer_query_with_context(pre, query, df, document_embeddings, show_prompt=False):
    """
    Build the prompt, call the OpenAI completions API, and return the answer.
    The completion parameters are defined above in COMPLETIONS_API_PARAMS:
    temperature is set to 1 and max_tokens to 300, which allows a reasonably long answer.
    """
    prompt, long_context = construct_prompt(
        pre,
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)
    response = openai.Completion.create(
        prompt=prompt,
        stop=[".", " END"],
        **COMPLETIONS_API_PARAMS
    )
    return long_context, response["choices"][0]["text"].strip(" \n")
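# Example call (sketch; assumes df and context_embeddings are already loaded,
# and the question here is hypothetical):
#   ctx, ans = answer_query_with_context(
#       "Rispondi alla domanda come se fossi Giorgia Meloni.",
#       "Cosa pensi del PNRR?", df, context_embeddings)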
def embedding_storage_to_dict(path):
    """
    Load pre-computed embeddings from a CSV file and rebuild the
    {section index: embedding vector} dictionary used for similarity search.
    """
    df_prueba = pd.read_csv(path, engine="python")
    df_prueba.drop('Unnamed: 0', inplace=True, axis=1)
    # The CSV stores index tuples and vectors as strings; eval() turns them
    # back into Python objects.
    df_prueba['section'] = df_prueba['section'].apply(lambda x: eval(x))
    df_prueba['vector'] = df_prueba['vector'].apply(lambda x: eval(x))
    sections_list = df_prueba.section
    vectors_list = df_prueba.vector
    embeddings_dictionary_from_storage = {section: vector for section, vector in zip(sections_list, vectors_list)}
    return embeddings_dictionary_from_storage
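# Expected storage CSV layout (inferred from the eval calls above; values are
# hypothetical): 'section' holds a stringified index tuple and 'vector' a
# stringified list of floats.
#   ,section,vector
#   0,"(0, 'economia')","[0.0123, -0.0456, ...]"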
context_embeddings_calenda = embedding_storage_to_dict('Botlitica_Calenda_Vectors.csv')
context_embeddings_letta = embedding_storage_to_dict('Botlitica_Letta_Vectors.csv')
context_embeddings = embedding_storage_to_dict('Botlitica_Meloni_Vectors.csv')
def greet(question, candidate):
    # Each branch selects the candidate's embeddings, dataframe, and instruction
    # ("Rispondi alla domanda come se fossi ..." = "Answer the question as if you were ...").
    if candidate == 'Meloni':
        context_embeddings_selected = context_embeddings
        df_selected = df
        pre = "Rispondi alla domanda come se fossi Giorgia Meloni."
    elif candidate == 'Calenda':
        context_embeddings_selected = context_embeddings_calenda
        df_selected = df_calenda
        pre = "Rispondi alla domanda come se fossi Carlo Calenda."
    elif candidate == 'Letta':
        context_embeddings_selected = context_embeddings_letta
        df_selected = df_letta
        pre = "Rispondi alla domanda come se fossi Enrico Letta."
    contexto, respuesta = answer_query_with_context(pre, question, df_selected, context_embeddings_selected, show_prompt=True)
    return respuesta, contexto
css = """@import url(https://fonts.googleapis.com/css?family=Roboto:400,300,600,400italic);
body {background: linear-gradient(to right, #228B22, #FFFFFF, #FF0000);} /* Italian flag colors */
h3 {
    text-align: center;
    font-family: "Roboto", Helvetica, Arial, sans-serif;
    color: rgb(1, 1, 1);
}
button {
    background-color: #a8d4e2; /* light blue */
    color: white;
    padding: 10px 20px;
    border: 0;
    border-radius: 5px;
    cursor: pointer;
    font-size: 16px;
    display: block;
    margin: auto;
}
.center {
    text-align: center;
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    height: 100vh;
}
"""
with gr.Blocks() as demo:
    with gr.Row():
        # (Italian UI text: "Botlitica is a conversational AI trained to answer your
        # questions, mirroring the pre-election political propaganda on social media
        # (Twitter and Facebook) conducted by the leaders of three parties:")
        gr.Markdown(
            """
            # Botlitica!
            Botlitica è una AI conversazionale addestrata per rispondere alle vostre domande rispecchiando la propaganda politica sui social media (Twitter e Facebook) pre-elezioni condotta dai premiers di tre partiti:
            """)
    question = gr.Textbox(label="Question")
    with gr.Row():
        candidate = gr.Dropdown(
            ["Meloni", "Calenda", "Letta"], label="Candidato")
    greet_btn = gr.Button("Chiedere")
    output = [gr.Textbox(lines=3, label='Generative AI response'), gr.Textbox(lines=3, label='Context used')]
    greet_btn.click(fn=greet, inputs=[question, candidate], outputs=output, api_name="greet")
    gr.Markdown(
        """
        # Was this answer useful?
        """)
    with gr.Row():
        feed_btn = gr.Button("Yes :)")
        feed_btn_neg = gr.Button("No :(")

demo.launch()