evegarcianz committed on
Commit
7e8112b
1 Parent(s): 1c6c050

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +286 -0
app.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party dependencies must be installed outside this script (for example
# through requirements.txt): openai, transformers, gradio, pandas, numpy.
# Notebook "!pip install ..." magics are not valid Python in a .py file.
import os
import re

import numpy
import numpy as np
import openai
import pandas as pd
from numpy.linalg import norm
from openai.api_resources import engine
from transformers import GPT2TokenizerFast

# SECURITY: never commit API keys. The key is read from the environment
# (e.g. a deployment secret named OPENAI_API_KEY). Any key that was ever
# committed to the repository history must be revoked and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Shared GPT-2 tokenizer used to count tokens of document sections.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
20
+
21
+
22
+
23
+
24
def _load_candidate_frame(csv_path):
    """Read one candidate CSV, index it by (number, prompt) and count tokens per row.

    The 'completion' column is renamed to 'content' and the unnamed CSV index
    column to 'number'; a 'tokens' column holds the GPT-2 token count of each
    'content' value.
    """
    def _count_tokens(text):
        return len(tokenizer.tokenize(text))

    frame = pd.read_csv(csv_path)
    frame.rename(
        columns={'Unnamed: 0': 'number', 'prompt': 'prompt', 'completion': 'content'},
        inplace=True,
    )
    frame.set_index(['number', 'prompt'], inplace=True)
    frame['tokens'] = frame['content'].apply(_count_tokens)
    return frame


# Meloni dataframe
df = _load_candidate_frame('roboGiorgia_3docsCleaner.csv')

# Calenda dataframe
df_calenda = _load_candidate_frame('roboCalenda_3docsCleaner_84prompting.csv')

# Letta dataframe
df_letta = _load_candidate_frame('roboLetta_3source_cleaner_84prompting.csv')
41
+
42
# Name of the OpenAI completion model used to generate answers.
COMPLETIONS_MODEL = "text-davinci-003"

# Keyword arguments forwarded to every completion request.
COMPLETIONS_API_PARAMS = dict(
    temperature=1,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)
49
+
50
def get_embedding(text: str, model: str):
    """
    Request an embedding for `text` from the OpenAI Embeddings API using `model`.

    Return the embedding of the first (and only) input item in the response.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]
61
+
62
def get_doc_embedding(text):
    """
    Embed a document section by delegating to get_embedding.

    The embedding model name is fixed here so callers only pass the text.

    Return an embedding vector.
    """
    embedding_model = 'text-embedding-ada-002'
    return get_embedding(text, embedding_model)
70
+
71
+
72
+
73
def get_query_embedding(text):
    """
    Embed a user query by delegating to get_embedding.

    The embedding model name is fixed here so callers only pass the text.

    Return an embedding vector.
    """
    embedding_model = 'text-embedding-ada-002'
    return get_embedding(text, embedding_model)
81
+
82
+
83
+
84
def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the `content` of every row of the dataframe.

    Newlines in the content are replaced by spaces before embedding.

    Return a dictionary mapping each row index to its embedding vector.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        flattened = row.content.replace("\n", " ")
        embeddings_by_index[row_index] = get_doc_embedding(flattened)
    return embeddings_by_index
93
+
94
def vector_similarity(x, y):
    """
    Cosine similarity between two 1-D vectors.

    Args:
        x: sequence of numbers.
        y: sequence of numbers with the same length as `x`.

    Returns:
        dot(x, y) / (|x| * |y|). When either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so that the
        result can still be sorted reliably.
    """
    a = np.asarray(x, dtype=float)
    b = np.asarray(y, dtype=float)
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN, which would make later sorting order undefined.
        return 0.0
    return np.dot(a, b) / denominator
100
+
101
def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed the query and score it against every pre-computed document embedding.

    `contexts` maps a document index to its embedding vector.

    Return a list of (similarity, document index) tuples sorted in descending order.
    """
    query_vector = get_query_embedding(query)

    scored_sections = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_vector, doc_embedding)
        scored_sections.append((score, doc_index))

    scored_sections.sort(reverse=True)
    return scored_sections
115
+
116
def construct_prompt(pre, question, context_embeddings, df, max_section_len=1650):
    """
    Build a completion prompt from the document sections most similar to the question.

    Sections are taken in the order returned by
    order_document_sections_by_query_similarity and concatenated until the
    running token count (section tokens plus separator tokens) would exceed
    `max_section_len`; the first section that does not fit ends the selection.

    Args:
        pre: instruction text placed at the very start of the prompt.
        question: the user question; used both for ranking and in the prompt.
        context_embeddings: mapping of section index -> embedding vector.
        df: dataframe indexed by section index, with `content` and `tokens` columns.
        max_section_len: token budget for the concatenated context (default 1650).

    Returns:
        (prompt, long_context) where long_context is the concatenation of the
        chosen sections, each prefixed by the bullet separator.
    """
    separator = "\n* "
    # Reuse the module-level tokenizer instead of reloading GPT-2 on every call.
    separator_len = len(tokenizer.tokenize(separator))

    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    for _similarity, section_index in ranked_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]

        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > max_section_len:
            break

        chosen_sections.append(separator + document_section.content.replace("\n", " "))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")

    long_context = "".join(chosen_sections)
    # The original literal had four opening quotes, which injected a stray '"'
    # character right after `pre`; the header is now exactly "\n\nContext:\n".
    prompt = pre + "\n\nContext:\n" + long_context + "\n\n Q: " + question + "\n A:"
    return prompt, long_context
169
+
170
def answer_query_with_context(pre,query,df,document_embeddings, show_prompt= False):
    """
    Build a context-grounded prompt for `query` and ask the OpenAI Completion API to answer it.

    Sampling settings come from COMPLETIONS_API_PARAMS; generation stops at the
    first "." or " END". When `show_prompt` is true the full prompt is printed.

    Return a tuple (long_context, answer) where the answer has surrounding
    spaces and newlines stripped.
    """
    prompt, long_context = construct_prompt(pre, query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(
        prompt=prompt,
        stop=[".", " END"],
        **COMPLETIONS_API_PARAMS
    )
    answer_text = completion["choices"][0]["text"]

    return long_context, answer_text.strip(" \n")
193
+
194
def embedding_storage_to_dict(path):
    """
    Load stored embeddings from a CSV file into a dictionary.

    The CSV must have a `section` column and a `vector` column whose cells are
    Python literals written as text (e.g. "(0, 'prompt')" and "[0.1, 0.2]").
    An optional 'Unnamed: 0' column (a saved dataframe index) is dropped.

    Args:
        path: path of the CSV file to read.

    Returns:
        dict mapping each parsed section value to its parsed vector.

    Raises:
        ValueError / SyntaxError: if a cell is not a plain Python literal.
    """
    import ast

    stored = pd.read_csv(path, engine="python")
    # Tolerate files saved without the index column.
    stored = stored.drop(columns='Unnamed: 0', errors='ignore')

    # SECURITY: the original used eval() on file contents, which executes
    # arbitrary code from the CSV. literal_eval only accepts literals.
    sections = [ast.literal_eval(cell) for cell in stored['section']]
    vectors = [ast.literal_eval(cell) for cell in stored['vector']]

    return dict(zip(sections, vectors))
204
+
205
+
206
# Pre-computed section embeddings for each candidate, loaded once at start-up.
context_embeddings_calenda = embedding_storage_to_dict(
    'Botlitica_Calenda_Vectors.csv'
)
context_embeddings_letta = embedding_storage_to_dict(
    'Botlitica_Letta_Vectors.csv'
)
# The unsuffixed name holds the Meloni embeddings.
context_embeddings = embedding_storage_to_dict(
    'Botlitica_Meloni_Vectors.csv'
)
209
+
210
+
211
+
212
def greet(question, candidate):
    """
    Answer `question` in the voice of the selected candidate.

    Args:
        question: the user's question text.
        candidate: one of 'Meloni', 'Calenda' or 'Letta'.

    Returns:
        (context, answer): the document context used for the prompt and the
        generated answer.

    Raises:
        ValueError: if `candidate` is missing or not a known candidate (for
            example when no dropdown option was selected). Previously this
            failed with an UnboundLocalError.
    """
    personas = {
        'Meloni': "Giorgia Meloni",
        'Calenda': "Carlo Calenda",
        'Letta': "Enrico Letta",
    }
    # Validate first so an empty selection gives a clear error message.
    if candidate not in personas:
        raise ValueError(
            f"Unknown candidate {candidate!r}; expected one of {sorted(personas)}"
        )

    if candidate == 'Meloni':
        context_embeddings_selected, df_selected = context_embeddings, df
    elif candidate == 'Calenda':
        context_embeddings_selected, df_selected = context_embeddings_calenda, df_calenda
    else:
        context_embeddings_selected, df_selected = context_embeddings_letta, df_letta

    pre = f"Rispondi alla domanda come se fossi {personas[candidate]}."

    contexto, respuesta = answer_query_with_context(
        pre, question, df_selected, context_embeddings_selected, show_prompt=True
    )
    return contexto, respuesta
241
+
242
+
243
import gradio as gr

# Gradio user interface.
# NOTE(review): the original indentation was lost in this view; the nesting of
# the `with` blocks below is reconstructed from the statement order — confirm
# against the real app.py before relying on the exact layout.
with gr.Blocks() as demo:
    # Title and short description of the app.
    with gr.Row():
        gr.Markdown(
            """
            # Botlitica!
            Botlitica è una AI conversazionale addestrata per rispondere alle vostre domande rispecchiando la propaganda politica sui social media (Twitter e Facebook) pre-elezioni condotta dai premiers di tre partiti:
            """)

    # Free-text question entered by the user.
    question = gr.Textbox(label="Question")

    # Candidate whose voice the answer should imitate.
    with gr.Row():
        candidate = gr.Dropdown(
            ["Meloni", "Calenda", "Letta"], label="Candidato")

    greet_btn = gr.Button("Chiedere")

    # Two outputs: the retrieved context and the generated answer.
    output = [
        gr.Textbox(lines=3, label='Context used'),
        gr.Textbox(lines=3, label='Generative AI response'),
    ]
    greet_btn.click(fn=greet, inputs=[question, candidate], outputs=output, api_name="greet")

    gr.Markdown(
        """
        # Was this answer useful?
        """)
    # Feedback buttons; no click handlers are attached to them.
    with gr.Row():
        feed_btn = gr.Button("Yes :)")
        feed_btn_neg = gr.Button("No :(")

demo.launch()