Spaces:

evegarcianz
/

Botlitica

Sleeping

App Files Files Community

evegarcianz committed on Aug 6, 2023

Commit

7e8112b

•

1 Parent(s): 1c6c050

Create app.py

Browse files

Files changed (1) hide show

app.py +286 -0

app.py ADDED Viewed

	@@ -0,0 +1,286 @@

# NOTE: `!pip install ...` is IPython/Colab shell syntax and a SyntaxError in a
# plain .py module. Install the dependencies outside this file (for example by
# listing them in requirements.txt):
#   openai
#   transformers
#   gradio
import os
import re

import numpy
import numpy as np
import openai
import pandas as pd
from numpy.linalg import norm
from openai.api_resources import engine
from transformers import GPT2TokenizerFast

# SECURITY: an API key must never be committed to source control. The key is
# read from the OPENAI_API_KEY environment variable instead; the key that was
# previously hard-coded on this line has been published and must be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def _load_sections(csv_path):
    """
    Load one candidate's document sections from a CSV file.
    The 'Unnamed: 0' column is renamed to 'number' and 'completion' to 'content', the rows are
    indexed by ('number', 'prompt'), and a 'tokens' column is added holding the number of GPT-2
    tokens in each 'content' value.
    Return the prepared dataframe.
    """
    sections = pd.read_csv(csv_path)
    sections = sections.rename(columns={'Unnamed: 0': 'number', 'completion': 'content'})
    sections = sections.set_index(['number', 'prompt'])
    sections['tokens'] = sections['content'].apply(lambda text: len(tokenizer.tokenize(text)))
    return sections


#Meloni dataframe
df = _load_sections('roboGiorgia_3docsCleaner.csv')
#Calenda dataframe
df_calenda = _load_sections('roboCalenda_3docsCleaner_84prompting.csv')
#Letta dataframe
df_letta = _load_sections('roboLetta_3source_cleaner_84prompting.csv')
# Completion model and the request parameters sent with every completion call.
COMPLETIONS_MODEL = "text-davinci-003"
COMPLETIONS_API_PARAMS = dict(
    temperature=1,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)
def get_embedding(text: str, model: str):
    """
    Request an embedding for `text` from the OpenAI Embeddings API using the given `model`.
    Return the value stored under "embedding" in the first item of the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]
def get_doc_embedding(text):
    """
    Embed a document section by delegating to get_embedding.
    The embedding model name is fixed here so callers only need to pass the text.
    Return an embedding vector.
    """
    embedding_model = 'text-embedding-ada-002'
    return get_embedding(text, embedding_model)
def get_query_embedding(text):
    """
    Embed a user query by delegating to get_embedding.
    The embedding model name is fixed here so callers only need to pass the text.
    Return an embedding vector.
    """
    embedding_model = 'text-embedding-ada-002'
    return get_embedding(text, embedding_model)
def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the 'content' of every row in the dataframe via get_doc_embedding.
    Newlines in the content are replaced by spaces before embedding.
    Return a dictionary mapping each row index to the embedding vector of that row.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        flattened_text = row.content.replace("\n", " ")
        embeddings_by_index[row_index] = get_doc_embedding(flattened_text)
    return embeddings_by_index
def vector_similarity(x, y):
    """
    Cosine similarity between two vectors.
    x and y are array-likes of the same length.
    Return the dot product divided by the product of the norms, or 0.0 when either vector has
    zero norm (the plain formula would divide by zero and yield nan, which does not order
    meaningfully when the similarities are sorted).
    """
    a = np.asarray(x, dtype=float)
    b = np.asarray(y, dtype=float)
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed the query and score it against every pre-computed document embedding in `contexts`
    (a dictionary of document index -> embedding vector).
    Return a list of (similarity, document index) tuples sorted in descending order.
    """
    query_vector = get_query_embedding(query)
    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))
    scored_sections.sort(reverse=True)
    return scored_sections
def construct_prompt(pre, question, context_embeddings, df):
    """
    Build the completion prompt from the document sections most relevant to the question.
    Sections are ranked with order_document_sections_by_query_similarity and appended, most
    similar first, until adding the next one would push the running token count (the section's
    'tokens' value plus the separator's tokens) above MAX_SECTION_LEN; the loop stops there.
    `pre` is the instruction line placed before the context, `context_embeddings` maps section
    index -> embedding vector, and `df` provides the 'content' and 'tokens' of each section.
    Reference cited by the original author for this prompt layout:
    https://beta.openai.com/docs/guides/fine-tuning/example-notebooks
    Return the complete prompt and the long_context, which is the concatenation of the chosen
    sections.
    """
    MAX_SECTION_LEN = 1650
    SEPARATOR = "\n* "
    # Use the module-level tokenizer rather than loading a new GPT-2 tokenizer on every call.
    separator_len = len(tokenizer.tokenize(SEPARATOR))
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    chosen_sections = []
    chosen_sections_len = 0
    for _, section_index in ranked_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    long_context = "".join(chosen_sections)
    # The original literal had four opening quotes, which put a stray '"' in front of the
    # context header in every prompt; the header now starts directly with the blank line.
    prompt = pre + "\n\nContext:\n" + long_context + "\n\n Q: " + question + "\n A:"
    return prompt, long_context
def answer_query_with_context(pre,query,df,document_embeddings, show_prompt= False):
    """
    Build the prompt with construct_prompt, send it to the OpenAI Completion API and return
    the answer.
    The request uses COMPLETIONS_API_PARAMS and stops generating at the first "." or " END".
    When show_prompt is true the full prompt is printed before the request is made.
    Return the long context used in the prompt and the completion text with surrounding
    spaces and newlines stripped.
    """
    full_prompt, context_text = construct_prompt(pre, query, document_embeddings, df)
    if show_prompt:
        print(full_prompt)
    completion = openai.Completion.create(
        prompt=full_prompt,
        stop=[".", " END"],
        **COMPLETIONS_API_PARAMS
    )
    answer_text = completion["choices"][0]["text"]
    return context_text, answer_text.strip(" \n")
def embedding_storage_to_dict(path):
    """
    Load stored embeddings from a CSV file into a dictionary.
    The 'Unnamed: 0' column is dropped; every cell of the 'section' and 'vector' columns is a
    string that is evaluated back into a Python object.
    Return a dictionary mapping each evaluated 'section' value to its evaluated 'vector' value.
    """
    def _parse(cell):
        # NOTE(review): eval executes whatever expression the CSV cell contains, so this is only
        # acceptable for trusted files. If the cells are plain literals, ast.literal_eval would
        # be the safe replacement - confirm against the stored data before switching.
        return eval(cell)

    stored = pd.read_csv(path, engine="python")
    stored = stored.drop(columns='Unnamed: 0')
    section_keys = stored['section'].apply(_parse)
    vectors = stored['vector'].apply(_parse)
    return dict(zip(section_keys, vectors))
# Stored section embeddings for each candidate, loaded once when the module is run.
context_embeddings_calenda, context_embeddings_letta, context_embeddings = (
    embedding_storage_to_dict(vectors_csv)
    for vectors_csv in (
        'Botlitica_Calenda_Vectors.csv',
        'Botlitica_Letta_Vectors.csv',
        'Botlitica_Meloni_Vectors.csv',
    )
)
def greet(question,candidate):
    """
    Answer `question` in the voice of the selected candidate.
    `candidate` must be 'Meloni', 'Calenda' or 'Letta'; it selects the matching embeddings,
    dataframe and Italian instruction line passed to answer_query_with_context.
    Return the context used and the generated answer.
    Raise ValueError for any other candidate value (such as None). The original code left its
    local variables unassigned in that case and failed with an UnboundLocalError.
    """
    if candidate == 'Meloni':
        context_embeddings_selected = context_embeddings
        df_selected = df
        pre = "Rispondi alla domanda come se fossi Giorgia Meloni."
    elif candidate == 'Calenda':
        context_embeddings_selected = context_embeddings_calenda
        df_selected = df_calenda
        pre = "Rispondi alla domanda come se fossi Carlo Calenda."
    elif candidate == 'Letta':
        context_embeddings_selected = context_embeddings_letta
        df_selected = df_letta
        pre = "Rispondi alla domanda come se fossi Enrico Letta."
    else:
        raise ValueError(
            f"Unknown candidate: {candidate!r}. Expected one of 'Meloni', 'Calenda', 'Letta'."
        )
    contexto, respuesta = answer_query_with_context(
        pre, question, df_selected, context_embeddings_selected, show_prompt=True
    )
    return contexto, respuesta
import gradio as gr

# Gradio interface: an intro header, a question box, a candidate selector, a button that
# calls greet, two output boxes (context and answer) and two feedback buttons.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            value="""
        # Botlitica!
        Botlitica è una AI conversazionale addestrata per rispondere alle vostre domande rispecchiando la propaganda politica sui social media (Twitter e Facebook) pre-elezioni condotta dai premiers di tre partiti:
        """,
        )
    question = gr.Textbox(label="Question")
    with gr.Row():
        candidate = gr.Dropdown(
            choices=["Meloni", "Calenda", "Letta"],
            label="Candidato",
        )
    greet_btn = gr.Button("Chiedere")
    output = [
        gr.Textbox(lines=3, label='Context used'),
        gr.Textbox(lines=3, label='Generative AI response'),
    ]
    greet_btn.click(
        fn=greet,
        inputs=[question, candidate],
        outputs=output,
        api_name="greet",
    )
    gr.Markdown(
        value="""
    # Was this answer useful?
    """,
    )
    with gr.Row():
        # The feedback buttons are created but no click handler is attached to them here.
        feed_btn = gr.Button("Yes :)")
        feed_btn_neg = gr.Button("No :(")
demo.launch()