import gradio as gr
from wordcloud import WordCloud
import requests
import json
import hopsworks
import matplotlib.pyplot as plt
import os
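
# OpenAI chat-completions settings; the API key is read from the
# environment (on a Hugging Face Space, set OPENAI_API_KEY as a secret).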
MODEL = "gpt-3.5-turbo"
API_URL = "https://api.openai.com/v1/chat/completions"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
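
# Connect to the Hopsworks feature store and read the two feature groups:
# per-topic keyword/score lists (daily_topic_info) and per-document
# titles, links, and topic assignments (daily_document_info).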
project = hopsworks.login(project="ServerlessIntroIris")
fs = project.get_feature_store()
df = fs.get_feature_group(name="daily_topic_info").read()
df2 = fs.get_feature_group(name="daily_document_info").read()
topics = df['topic'].unique()
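
# Query the OpenAI chat-completions API with streaming enabled and
# accumulate the streamed tokens into a single response string.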
def gpt_predict(inputs, top_p=1.0, temperature=1.0, chat_counter=0, history=None):
    # Avoid a mutable default argument for the conversation history.
    if history is None:
        history = []
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": f"{inputs}"}],
        "temperature": temperature,
        "top_p": top_p,
        "n": 1,
        "stream": True,
        "presence_penalty": 0,
        "frequency_penalty": 0,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}",
    }
    # On follow-up turns, rebuild the full conversation from history:
    # even indices are user turns, odd indices are assistant turns.
    if chat_counter != 0:
        messages = []
        for i, data in enumerate(history):
            role = 'user' if i % 2 == 0 else 'assistant'
            messages.append({"role": role, "content": data})
        messages.append({"role": "user", "content": f"{inputs}"})
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }
    chat_counter += 1
    history.append(inputs)
    token_counter = 0
    partial_words = ""
    counter = 0
    try:
        # POST to the API with stream=True so tokens arrive incrementally
        # as server-sent events instead of one final JSON body.
        response = requests.post(API_URL, headers=headers, json=payload, stream=True)
        if response.status_code != 200:
            raise Exception(f"Sorry, hitting rate limit. Please try again later. {response}")
        for chunk in response.iter_lines():
            # Skip the first event (it only carries the assistant role).
            if counter == 0:
                counter += 1
                continue
            # Non-empty lines look like 'data: {json}'; strip the 'data: '
            # prefix before parsing. The length check also skips the
            # terminating 'data: [DONE]' sentinel.
            line = chunk.decode()
            if line and len(line) > 12:
                delta = json.loads(line[6:])['choices'][0]['delta']
                if "content" in delta:
                    partial_words += delta["content"]
                    if token_counter == 0:
                        history.append(" " + partial_words)
                    else:
                        history[-1] = partial_words
                    token_counter += 1
    except Exception as e:
        print(f'error found: {e}')
    return partial_words
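
# Ask the model to condense each topic's keyword list into one descriptive
# word; fall back to the highest-scoring keyword per topic if the call
# fails or returns too few labels.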
readable_topics_dic = dict()
prompt = "I have lists of multiple words : "
mrk = []  # most representative keyword per topic (fallback if ChatGPT is unavailable)
for t in topics.tolist():
    if t != -1:
        selected_data = df[df['topic'] == t]
        keywords = selected_data['keywords'][selected_data.index[0]]
        freq = selected_data["scores"][selected_data.index[0]]
        # Pair each keyword with its score and keep the highest-scoring one.
        keyword_freq_pairs = zip(keywords, freq)
        most_frequent_keyword = max(keyword_freq_pairs, key=lambda x: x[1])
        print(most_frequent_keyword[0].capitalize())
        mrk.append(most_frequent_keyword[0].capitalize())
        prompt += ", [" + ", ".join(keywords) + "]"
prompt += (" I want you to give me only one precise word that best describes the theme of this list."
           " If I give you multiple lists, I want you to give me one capitalized word for each of them,"
           " separated by // (your answer should contain only one word per list; if I give you 100"
           " lists, you give me 100 words).")
new_topics = gpt_predict(prompt)
nt = [label.strip() for label in new_topics.split("//")]

# Fallback in case ChatGPT is overloaded or returns too few labels:
# name each topic after its top keyword (topic -1 is the outlier topic
# and gets no label).
num_labeled = sum(1 for t in topics.tolist() if t != -1)
if len(nt) < num_labeled:
    nt = [f"Topic {o+1}: {mrk[o]}" for o in range(num_labeled)]
i = 0
for t in topics.tolist():
    if t != -1:
        readable_topics_dic[nt[i]] = t
        if i < len(nt) - 1:
            i += 1
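
# UI callback: given a human-readable topic label, return markdown links
# to the most relevant articles and a word cloud of the topic's keywords.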
def display_topics(topic):
    # Map the human-readable label back to its numeric topic id.
    topic = readable_topics_dic[topic]
    # Filter both DataFrames on the selected topic.
    selected_data = df[df['topic'] == topic]
    selected_data2 = df2[df2['topic'] == topic]
    # Rank documents by topic probability, most relevant first.
    selected_data2 = selected_data2.sort_values(by='probability', ascending=False)
    # Render up to four of the most relevant articles as markdown links.
    articles = selected_data2['title']
    links = selected_data2['link']
    nb_art = min(4, len(links))
    articles_ret = "## Most relevant articles\n"
    for i in range(nb_art):
        ind = articles.index[i]
        articles_ret += f" * [{articles[ind]}]({links[ind]})\n"
    # Generate a word cloud weighted by each keyword's score.
    keywords = selected_data['keywords'][selected_data.index[0]]
    freq = selected_data["scores"][selected_data.index[0]]
    keywords_wordcloud = {kw: freq[i] for i, kw in enumerate(keywords)}
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(keywords_wordcloud)
    fig, ax = plt.subplots()
    ax.axis("off")
    ax.imshow(wordcloud, interpolation='bilinear')
    return articles_ret, fig
# Define the Gradio interface: a topic dropdown in, article links and a
# keyword word-cloud plot out.
iface = gr.Interface(
    fn=display_topics,
    inputs=gr.Dropdown(nt, label="Topic"),
    outputs=[gr.Markdown(label="Most relevant articles"), gr.Plot(label="Main Keywords")],
    live=True,
    examples=[]
)

# Launch the app
iface.launch()