import ast
import os

import gradio as gr
import numpy as np
import openai
import pandas as pd
from numpy.linalg import norm
from transformers import GPT2TokenizerFast

# The API key is read from the environment; never hardcode it in the source.
openai.api_key = os.environ['ed_key']

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")



def load_candidate_df(path):
    """
    Load one candidate's CSV (columns: 'Unnamed: 0' row number, 'prompt',
    'completion') and add a per-row token count used for prompt budgeting.
    """
    frame = pd.read_csv(path)
    frame.rename(columns={'Unnamed: 0': 'number', 'completion': 'content'}, inplace=True)
    frame.set_index(['number', 'prompt'], inplace=True)
    frame['tokens'] = frame['content'].apply(lambda x: len(tokenizer.tokenize(x)))
    return frame

# One dataframe per candidate
df = load_candidate_df('roboGiorgia_3docsCleaner.csv')                      # Meloni
df_calenda = load_candidate_df('roboCalenda_3docsCleaner_84prompting.csv')  # Calenda
df_letta = load_candidate_df('roboLetta_3source_cleaner_84prompting.csv')   # Letta

COMPLETIONS_MODEL = "text-davinci-003"

# Completion parameters: temperature=1 allows varied, persona-like answers;
# max_tokens=300 bounds the length of each reply.
COMPLETIONS_API_PARAMS = {
    "temperature": 1,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

def get_embedding(text: str, model: str):
    """
    Create an embedding for the given string using the OpenAI Embeddings API
    and the chosen embedding model.

    Return the embedding vector.
    """
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    return result["data"][0]["embedding"]

def get_doc_embedding(text):
    """
    Wrapper around get_embedding for document sections. The model is hardcoded
    for simplicity when importing functions from this file.

    Return the embedding vector.
    """
    return get_embedding(text, 'text-embedding-ada-002')


def get_query_embedding(text):
    """
    Wrapper around get_embedding for user queries; same hardcoded model as
    get_doc_embedding.

    Return the embedding vector.
    """
    return get_embedding(text, 'text-embedding-ada-002')


def compute_doc_embeddings(df: pd.DataFrame):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Return a dictionary mapping each row index to its embedding vector.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()
    }
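
# Hypothetical one-off usage (not executed here, since it calls the paid
# embeddings API once per row; this app loads precomputed vectors instead):
#
#   context_embeddings = compute_doc_embeddings(df)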

def vector_similarity(x, y):
    """
    Cosine similarity between two vectors.
    """
    return np.dot(np.array(x), np.array(y)) / (norm(x) * norm(y))
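
# Sanity check with made-up vectors (illustrative only, not app data):
# identical vectors score 1.0, orthogonal vectors score 0.0.
#
#   assert abs(vector_similarity([1.0, 0.0], [1.0, 0.0]) - 1.0) < 1e-9
#   assert abs(vector_similarity([1.0, 0.0], [0.0, 1.0])) < 1e-9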

def order_document_sections_by_query_similarity(query, contexts):
    """
    Finds the query embedding and compares it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_query_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

def construct_prompt(pre, question, context_embeddings, df):
    """
    Build the final prompt from the most relevant document sections, as ranked
    by order_document_sections_by_query_similarity.

    MAX_SECTION_LEN (in tokens) controls how many sections are concatenated
    into the prompt: the function stops joining sections once the limit is
    reached.

    The preprompt (the `pre` instruction plus a "Context:" header) tells the
    model to answer in the chosen candidate's voice using the supplied context.
    This prompt layout follows the OpenAI documentation:
    https://beta.openai.com/docs/guides/fine-tuning/example-notebooks

    Return the complete prompt and long_context, the concatenation of the
    chosen most relevant sections.
    """
    MAX_SECTION_LEN = 1650
    SEPARATOR = "\n* "

    separator_len = len(tokenizer.tokenize(SEPARATOR))

    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for simi, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]

        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(simi) + ' ' + str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    preprompt = pre + "\n\nContext:\n"

    prompt = preprompt + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"
    long_context = "".join(chosen_sections)
    return prompt, long_context

def answer_query_with_context(pre, query, df, document_embeddings, show_prompt=False):
    """
    Build the prompt and call the OpenAI completions API, returning an answer.
    The completion parameters are defined above in COMPLETIONS_API_PARAMS:
    temperature=1 and max_tokens=300, which gives a reasonably long answer.
    """
    prompt, long_context = construct_prompt(
        pre,
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    # Stop at the first full stop or an explicit " END" marker.
    response = openai.Completion.create(
                prompt=prompt,
                stop=[".", " END"],
                **COMPLETIONS_API_PARAMS
            )

    return long_context, response["choices"][0]["text"].strip(" \n")

def embedding_storage_to_dict(path):
    """
    Load precomputed embeddings from a CSV into a {section_index: vector} dict.
    The 'section' and 'vector' columns are stored as string literals, so they
    are parsed back with ast.literal_eval (safer than eval, since the stored
    values are plain tuple/list literals).
    """
    df_prueba = pd.read_csv(path, engine="python")
    df_prueba.drop('Unnamed: 0', inplace=True, axis=1)
    df_prueba['section'] = df_prueba['section'].apply(ast.literal_eval)
    df_prueba['vector'] = df_prueba['vector'].apply(ast.literal_eval)
    sections_list = df_prueba.section
    vectors_list = df_prueba.vector

    embeddings_dictionary_from_storage = {section: vector for section, vector in zip(sections_list, vectors_list)}
    return embeddings_dictionary_from_storage
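
# Minimal sketch of the matching writer (the original is not in this file, so
# the helper name and exact layout are assumptions): it serializes the
# {section_index: vector} dict produced by compute_doc_embeddings into the
# two-column CSV format that embedding_storage_to_dict parses back.
def embedding_dict_to_storage(embeddings_dict, path):
    df_out = pd.DataFrame({
        "section": [repr(section) for section in embeddings_dict.keys()],
        "vector": [repr(vector) for vector in embeddings_dict.values()],
    })
    # The default index becomes the 'Unnamed: 0' column dropped on read.
    df_out.to_csv(path)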


# Precomputed embedding stores, one per candidate
context_embeddings_calenda = embedding_storage_to_dict('Botlitica_Calenda_Vectors.csv')
context_embeddings_letta = embedding_storage_to_dict('Botlitica_Letta_Vectors.csv')
context_embeddings = embedding_storage_to_dict('Botlitica_Meloni_Vectors.csv')



def greet(question, candidate):
    """Gradio handler: answer the question in the selected candidate's voice."""
    if candidate == 'Meloni':
        context_embeddings_selected = context_embeddings
        df_selected = df
        pre = "Rispondi alla domanda come se fossi Giorgia Meloni."
    elif candidate == 'Calenda':
        context_embeddings_selected = context_embeddings_calenda
        df_selected = df_calenda
        pre = "Rispondi alla domanda come se fossi Carlo Calenda."
    elif candidate == 'Letta':
        context_embeddings_selected = context_embeddings_letta
        df_selected = df_letta
        pre = "Rispondi alla domanda come se fossi Enrico Letta."

    contexto, respuesta = answer_query_with_context(pre, question, df_selected, context_embeddings_selected, show_prompt=True)

    return respuesta, contexto
    




css= """.gradio-container {
  background: linear-gradient(-45deg, 
                                 #FF0000, #FFFFFF, #228B22);
  
  
}


        """


#{background: linear-gradient(to right, #228B22, #FFFFFF, #FF0000);}


with gr.Blocks(css=css) as demo:
    with gr.Row():
        gr.Markdown(
        """
        # Botlitica!
        Botlitica è una AI conversazionale addestrata per rispondere alle vostre domande rispecchiando la propaganda politica sui social media (Twitter e Facebook) pre-elezioni condotta dai premiers di tre partiti:
        """)

    question = gr.Textbox(label="Domanda")

    with gr.Row():
        candidate = gr.Dropdown(
            ["Meloni", "Calenda", "Letta"], label="Candidato")

    greet_btn = gr.Button("Chiedere")

    output = [gr.Textbox(lines=3, label='Generative AI risposta'), gr.Textbox(lines=3, label='Contesto utilizzato')]
    greet_btn.click(fn=greet, inputs=[question, candidate], outputs=output, api_name="greet")




demo.launch()