import os
import ast

import numpy as np
import pandas as pd
import openai
import gradio as gr
from numpy.linalg import norm
from transformers import GPT2TokenizerFast

# The OpenAI API key is read from the environment rather than hardcoded.
openai.api_key = os.environ['ed_key']

# Tokenizer used to measure section lengths when budgeting the prompt.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Each candidate has a CSV of cleaned campaign documents with "prompt" and
# "completion" columns; "completion" is renamed to "content", rows are indexed
# by (number, prompt), and a token count per section is pre-computed for
# prompt budgeting.
def load_candidate_df(path):
    candidate_df = pd.read_csv(path)
    candidate_df.rename(columns={'Unnamed: 0': 'number', 'completion': 'content'}, inplace=True)
    candidate_df.set_index(['number', 'prompt'], inplace=True)
    candidate_df['tokens'] = candidate_df['content'].apply(lambda x: len(tokenizer.tokenize(x)))
    return candidate_df

# Meloni, Calenda and Letta dataframes.
df = load_candidate_df('roboGiorgia_3docsCleaner.csv')
df_calenda = load_candidate_df('roboCalenda_3docsCleaner_84prompting.csv')
df_letta = load_candidate_df('roboLetta_3source_cleaner_84prompting.csv')
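# Expected input CSV layout (illustrative values; only the column names above
# are relied on by the loader):
# ,prompt,completion
# 0,"Post Facebook del 2022-08-15","Testo del post ..."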
COMPLETIONS_MODEL = "text-davinci-003"
COMPLETIONS_API_PARAMS = {
    "temperature": 1,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
def get_embedding(text: str, model: str):
    """
    Create an embedding for the given string using the OpenAI Embeddings API
    with the chosen embedding model.
    Return the embedding vector.
    """
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    return result["data"][0]["embedding"]
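# Illustrative usage (commented out so no API call happens at import time;
# requires a valid key). text-embedding-ada-002 returns 1536-dimensional vectors:
# vec = get_embedding("Buongiorno a tutti", "text-embedding-ada-002")
# len(vec)  # -> 1536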
def get_doc_embedding(text):
    """
    Wrapper around get_embedding with the model hardcoded, for simplicity when
    importing functions from this file.
    Return the embedding vector.
    """
    return get_embedding(text, 'text-embedding-ada-002')

def get_query_embedding(text):
    """
    Wrapper around get_embedding with the model hardcoded, for simplicity when
    importing functions from this file.
    Return the embedding vector.
    """
    return get_embedding(text, 'text-embedding-ada-002')
def compute_doc_embeddings(df: pd.DataFrame):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    Return a dictionary that maps the index of each row to its embedding vector.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()
    }
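# Illustrative usage (commented out: this makes one API call per row). The keys
# are the dataframe's (number, prompt) MultiIndex entries:
# context_embeddings = compute_doc_embeddings(df)
# -> {(0, 'some prompt'): [0.0123, -0.0045, ...], ...}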
def vector_similarity(x, y):
    """
    Cosine similarity between two vectors: their dot product divided by the
    product of their norms.
    """
    return np.dot(np.array(x), np.array(y)) / (norm(x) * norm(y))
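# Sanity check (pure NumPy, safe to run by hand):
# vector_similarity([1, 0], [1, 0])  # -> 1.0 (same direction)
# vector_similarity([1, 0], [0, 1])  # -> 0.0 (orthogonal)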
def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed the query and compare it against all of the pre-calculated document
    embeddings to find the most relevant sections.
    Return the list of (similarity, section index) pairs, sorted by relevance
    in descending order.
    """
    query_embedding = get_query_embedding(query)
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    return document_similarities
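# Illustrative output (hypothetical question and scores; indexes are the
# (number, prompt) tuples from the dataframe):
# order_document_sections_by_query_similarity("Cosa pensi delle tasse?", context_embeddings)
# -> [(0.89, (12, 'tweet sulle tasse')), (0.84, (3, 'altro tweet')), ...]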
def construct_prompt(pre, question, context_embeddings, df):
    """
    Build a prompt from the most relevant document sections, as ranked by
    order_document_sections_by_query_similarity.
    MAX_SECTION_LEN (in tokens) is key because it controls how many sections are
    concatenated into the prompt: the function stops joining sections once
    MAX_SECTION_LEN is reached.
    The header prepended to the context (the "pre" argument) is also key: it
    instructs the model how to answer. This approach is based on the OpenAI
    documentation:
    https://beta.openai.com/docs/guides/fine-tuning/example-notebooks
    Return the complete prompt and long_context, the concatenation of the chosen
    most relevant sections.
    """
    MAX_SECTION_LEN = 1650
    SEPARATOR = "\n* "
    separator_len = len(tokenizer.tokenize(SEPARATOR))
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
    for simi, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(simi) + ' ' + str(section_index))
    # Useful diagnostic information.
    print(f"Selected {len(chosen_sections)} document sections:")
    preprompt = pre + "\n\nContext:\n"
    prompt = preprompt + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"
    long_context = "".join(chosen_sections)
    return prompt, long_context
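# Resulting prompt shape (sketch; sections abbreviated):
# Rispondi alla domanda come se fossi <candidato>.
#
# Context:
#
# * <most relevant section>
# * <next section>
#
#  Q: <question>
#  A: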
def answer_query_with_context(pre, query, df, document_embeddings, show_prompt=False):
    """
    Build the prompt, call the OpenAI Completions API and return an answer.
    Note the completion parameters defined above in COMPLETIONS_API_PARAMS:
    temperature is set to 1 and max_tokens of 300 gives a reasonably long answer.
    """
    prompt, long_context = construct_prompt(
        pre,
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)
    response = openai.Completion.create(
        prompt=prompt,
        stop=[".", " END"],
        **COMPLETIONS_API_PARAMS
    )
    return long_context, response["choices"][0]["text"].strip(" \n")
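# Illustrative usage (commented out; hypothetical question, makes live API calls):
# contesto, risposta = answer_query_with_context(
#     "Rispondi alla domanda come se fossi Giorgia Meloni.",
#     "Cosa pensi delle tasse?", df, context_embeddings)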
def embedding_storage_to_dict(path):
    """
    Load pre-computed embeddings from a CSV with "section" and "vector" columns
    and rebuild the {section index: embedding vector} dictionary.
    ast.literal_eval (rather than eval) parses the stored tuples and lists safely.
    """
    df_prueba = pd.read_csv(path, engine="python")
    df_prueba.drop('Unnamed: 0', inplace=True, axis=1)
    df_prueba['section'] = df_prueba['section'].apply(ast.literal_eval)
    df_prueba['vector'] = df_prueba['vector'].apply(ast.literal_eval)
    sections_list = df_prueba.section
    vectors_list = df_prueba.vector
    embeddings_dictionary_from_storage = {section: vector for section, vector in zip(sections_list, vectors_list)}
    return embeddings_dictionary_from_storage
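# Stored embeddings CSV layout (illustrative row; real vectors hold 1536 floats):
# ,section,vector
# 0,"(0, 'some prompt')","[0.0123, -0.0045, ...]"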
context_embeddings_calenda=embedding_storage_to_dict('Botlitica_Calenda_Vectors.csv')
context_embeddings_letta=embedding_storage_to_dict('Botlitica_Letta_Vectors.csv')
context_embeddings=embedding_storage_to_dict('Botlitica_Meloni_Vectors.csv')
def greet(question, candidate):
    """
    Gradio callback: select the chosen candidate's dataframe, embeddings and
    persona instruction, then answer the question with retrieved context.
    """
    if candidate == 'Meloni':
        context_embeddings_selected = context_embeddings
        df_selected = df
        pre = "Rispondi alla domanda come se fossi Giorgia Meloni."
    elif candidate == 'Calenda':
        context_embeddings_selected = context_embeddings_calenda
        df_selected = df_calenda
        pre = "Rispondi alla domanda come se fossi Carlo Calenda."
    elif candidate == 'Letta':
        context_embeddings_selected = context_embeddings_letta
        df_selected = df_letta
        pre = "Rispondi alla domanda come se fossi Enrico Letta."
    contexto, respuesta = answer_query_with_context(pre, question, df_selected, context_embeddings_selected, show_prompt=True)
    return respuesta, contexto
css= """.gradio-container {
background: linear-gradient(-45deg,
#FF0000, #FFFFFF, #228B22);
}
"""
#{background: linear-gradient(to right, #228B22, #FFFFFF, #FF0000);}
with gr.Blocks(css=css) as demo:
    with gr.Row():
        gr.Markdown(
            """
            # Botlitica!
            Botlitica è una AI conversazionale addestrata per rispondere alle vostre domande rispecchiando la propaganda politica sui social media (Twitter e Facebook) pre-elezioni condotta dai premiers di tre partiti:
            """)
    question = gr.Textbox(label="Domanda")
    with gr.Row():
        candidate = gr.Dropdown(
            ["Meloni", "Calenda", "Letta"], label="Candidato")
        greet_btn = gr.Button("Chiedere")
    output = [gr.Textbox(lines=3, label='Generative AI risposta'), gr.Textbox(lines=3, label='Contesto utilizzato')]
    greet_btn.click(fn=greet, inputs=[question, candidate], outputs=output, api_name="greet")

demo.launch()