import openai | |
import gradio as gr | |
from gradio.components import Audio, Textbox | |
import os | |
import re | |
import tiktoken | |
from transformers import GPT2Tokenizer | |
import whisper | |
import pandas as pd | |
from datetime import datetime, timezone, timedelta | |
import notion_df | |
import concurrent.futures | |
import nltk | |
from nltk.tokenize import sent_tokenize | |
nltk.download('punkt') | |
import spacy | |
from spacy import displacy | |
from gradio import Markdown | |
import threading | |
# --- Module-level setup -------------------------------------------------
# GPT-2 tokenizer, used only client-side to count and truncate tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Configure the OpenAI client. (The original chained this into a `model`
# variable, storing the API-key string as a "model" — the key is config,
# not a model, and `model` was never read anywhere.)
openai.api_key = os.environ["OPENAI_API_KEY"]
# System prompt that seeds every conversation.
initialt = 'You are a Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'
initial_message = {"role": "system", "content": initialt}
messages = [initial_message]       # chronological chat history sent to the API
messages_rev = [initial_message]   # reverse-chronological copy used for the transcript
# Number of assistant answers since the last history reset.
answer_count = 0
# Notion integration key (used by train() and the transcript uploads).
API_KEY = os.environ["API_KEY"]
# spaCy pipeline used by colorize_text for POS/entity tagging.
nlp = spacy.load("en_core_web_sm")
def process_nlp(system_message):
    """Return an HTML-colorized rendering of a chat message's content."""
    return colorize_text(system_message['content'])
from colour import Color | |
# # define color combinations for different parts of speech | |
# COLORS = { | |
# "NOUN": "#000000", # Black | |
# "VERB": "#ff6936", # Orange | |
# "ADJ": "#4363d8", # Blue | |
# "ADV": "#228b22", # Green | |
# "digit": "#9a45d6", # Purple | |
# "punct": "#ffcc00", # Yellow | |
# "quote": "#b300b3" # Magenta | |
# } | |
# # define color combinations for individuals with dyslexia and color vision deficiencies | |
# DYSLEXIA_COLORS = { | |
# "NOUN": "#000000", | |
# "VERB": "#ff6936", | |
# "ADJ": "#4363d8", | |
# "ADV": "#228b22", | |
# "digit": "#9a45d6", | |
# "punct": "#ffcc00", | |
# "quote": "#b300b3", | |
# } | |
# RED_GREEN_COLORS = { | |
# "NOUN": "#000000", | |
# "VERB": "#fe642e", # Lighter orange | |
# "ADJ": "#2e86c1", # Lighter blue | |
# "ADV": "#82e0aa", # Lighter green | |
# "digit": "#aa6c39", # Brown | |
# "punct": "#f0b27a", # Lighter yellow | |
# "quote": "#9932cc" # Darker magenta | |
# } | |
# # define a muted background color | |
# BACKGROUND_COLOR = "#ffffff" # White | |
# # define font and size | |
# FONT = "OpenDyslexic" | |
# FONT_SIZE = "18px" | |
# def colorize_text(text, colors=DYSLEXIA_COLORS, background_color=None, font=FONT, font_size=FONT_SIZE): | |
# if colors is None: | |
# colors = COLORS | |
# colorized_text = "" | |
# lines = text.split("\n") | |
# # set background color | |
# if background_color is None: | |
# background_color = BACKGROUND_COLOR | |
# # iterate over the lines in the text | |
# for line in lines: | |
# # parse the line with the language model | |
# doc = nlp(line) | |
# # iterate over the tokens in the line | |
# for token in doc: | |
# # check if the token is an entity | |
# if token.ent_type_: | |
# # use dyslexia colors for entity if available | |
# if colors == COLORS: | |
# color = DYSLEXIA_COLORS.get(token.pos_, None) | |
# else: | |
# color = colors.get(token.pos_, None) | |
# # check if a color is available for the token | |
# if color is not None: | |
# colorized_text += ( | |
# f'<span style="color: {color}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# else: | |
# colorized_text += ( | |
# f'<span style="font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# else: | |
# # check if a color is available for the token | |
# color = colors.get(token.pos_, None) | |
# if color is not None: | |
# colorized_text += ( | |
# f'<span style="color: {color}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# elif token.is_digit: | |
# colorized_text += ( | |
# f'<span style="color: {colors["digit"]}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# elif token.is_punct: | |
# colorized_text += ( | |
# f'<span style="color: {colors["punct"]}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# elif token.is_quote: | |
# colorized_text += ( | |
# f'<span style="color: {colors["quote"]}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# else: | |
# # use larger font size for specific parts of speech, such as nouns and verbs | |
# font_size = FONT_SIZE | |
# if token.pos_ in ["NOUN", "VERB"]: | |
# font_size = "22px" | |
# colorized_text += ( | |
# f'<span style="font-family: {font}; ' | |
# f'font-size: {font_size}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' | |
# f"{token.text}</span>" | |
# ) | |
# colorized_text += "<br>" | |
# return colorized_text | |
# # define color combinations for different parts of speech | |
# COLORS = { | |
# "NOUN": "#5e5e5e", # Dark gray | |
# "VERB": "#ff6936", # Orange | |
# "ADJ": "#4363d8", # Blue | |
# "ADV": "#228b22", # Green | |
# "digit": "#9a45d6", # Purple | |
# "punct": "#ffcc00", # Yellow | |
# "quote": "#b300b3" # Magenta | |
# } | |
# # define color combinations for individuals with dyslexia | |
# DYSLEXIA_COLORS = { | |
# "NOUN": "#5e5e5e", | |
# "VERB": "#ff6936", | |
# "ADJ": "#4363d8", | |
# "ADV": "#228b22", | |
# "digit": "#9a45d6", | |
# "punct": "#ffcc00", | |
# "quote": "#b300b3" | |
# } | |
# # define a muted background color | |
# BACKGROUND_COLOR = "#f5f5f5" # Light gray | |
# # define font and size | |
# FONT = "Arial" | |
# FONT_SIZE = "14px" | |
# # load the English language model | |
# nlp = spacy.load('en_core_web_sm') | |
# def colorize_text(text, colors=DYSLEXIA_COLORS, background_color=None): | |
# if colors is None: | |
# colors = COLORS | |
# colorized_text = "" | |
# lines = text.split("\n") | |
# # set background color | |
# if background_color is None: | |
# background_color = BACKGROUND_COLOR | |
# # iterate over the lines in the text | |
# for line in lines: | |
# # parse the line with the language model | |
# doc = nlp(line) | |
# # iterate over the tokens in the line | |
# for token in doc: | |
# # check if the token is an entity | |
# if token.ent_type_: | |
# # use dyslexia colors for entity if available | |
# if colors == COLORS: | |
# color = DYSLEXIA_COLORS.get(token.pos_, None) | |
# else: | |
# color = colors.get(token.pos_, None) | |
# # check if a color is available for the token | |
# if color is not None: | |
# colorized_text += ( | |
# f'<span style="color: {color}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# else: | |
# colorized_text += ( | |
# f'<span style="font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# else: | |
# # check if a color is available for the token | |
# color = colors.get(token.pos_, None) | |
# if color is not None: | |
# colorized_text += ( | |
# f'<span style="color: {color}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# elif token.is_digit: | |
# colorized_text += ( | |
# f'<span style="color: {colors["digit"]}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# elif token.is_punct: | |
# colorized_text += ( | |
# f'<span style="color: {colors["punct"]}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# elif token.is_quote: | |
# colorized_text += ( | |
# f'<span style="color: {colors["quote"]}; ' | |
# f'background-color: {background_color}; ' | |
# f'font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# else: | |
# colorized_text += ( | |
# f'<span style="font-family: {FONT}; ' | |
# f'font-size: {FONT_SIZE}; ' | |
# f'font-weight: bold; ' | |
# f'text-decoration: none; ' | |
# f'padding-right: 0.5em;">' # Add space between tokens | |
# f"{token.text}</span>" | |
# ) | |
# colorized_text += "<br>" | |
# return colorized_text | |
# Palette used for non-entity tokens, keyed by spaCy coarse POS tag
# (plus the special "digit"/"punct"/"quote" fallback keys).
COLORS = {
    "NOUN": "#FF3300",
    "VERB": "#008000",
    "ADJ": "#1E90FF",
    "ADV": "#FF8C00",
    "digit": "#FF1493",
    "punct": "#8B0000",
    "quote": "#800080",
}
# Alternative palette intended for readers with dyslexia; colorize_text
# switches to it for named-entity tokens when the default palette is active.
DYSLEXIA_COLORS = {
    "NOUN": "#1E90FF",
    "VERB": "#006400",
    "ADJ": "#00CED1",
    "ADV": "#FF8C00",
    "digit": "#FF1493",
    "punct": "#A0522D",
    "quote": "#800080",
}
# Muted background color applied behind every colorized token.
BACKGROUND_COLOR = "#EAEAEA"
# Font family and size used in every generated <span>.
FONT = "Georgia"
FONT_SIZE = "18px"
def _token_span(token_text, color, background_color):
    """Render one token as an HTML <span>.

    When *color* is None the span carries only the font styling; otherwise it
    additionally sets foreground and background colors.  Every branch of the
    original implementation emitted exactly one of these two markups, so the
    six duplicated f-string builders collapse into this helper.
    """
    if color is None:
        return (
            f'<span style="font-family: {FONT}; '
            f'font-size: {FONT_SIZE}; '
            f'text-decoration: underline;">'
            f"{token_text}</span>"
        )
    return (
        f'<span style="color: {color}; '
        f'background-color: {background_color}; '
        f'font-family: {FONT}; '
        f'font-size: {FONT_SIZE}; '
        f'text-decoration: underline;">'
        f"{token_text}</span>"
    )


def colorize_text(text, colors=None, background_color=None):
    """Colorize *text* by part of speech and return it as HTML.

    Each line is parsed with the module-level spaCy pipeline; every token
    becomes an underlined <span> colored by its POS tag (with digit /
    punctuation / quote fallbacks), tokens are separated by spaces, and each
    input line ends with "<br>".

    Parameters
    ----------
    colors : dict | None
        POS-tag -> hex-color mapping; defaults to COLORS.
    background_color : str | None
        CSS color behind colored tokens; defaults to BACKGROUND_COLOR.
    """
    if colors is None:
        colors = COLORS
    if background_color is None:
        background_color = BACKGROUND_COLOR
    # Collect fragments and join once at the end instead of the original
    # quadratic `colorized_text += ...` accumulation.
    parts = []
    for line in text.split("\n"):
        for token in nlp(line):
            if token.ent_type_:
                # Named entities: use the dyslexia palette when the caller
                # left the default palette in place (original behavior).
                palette = DYSLEXIA_COLORS if colors == COLORS else colors
                color = palette.get(token.pos_)
            else:
                color = colors.get(token.pos_)
                if color is None:
                    # Fall back to the special token-class colors.
                    if token.is_digit:
                        color = colors["digit"]
                    elif token.is_punct:
                        color = colors["punct"]
                    elif token.is_quote:
                        color = colors["quote"]
            parts.append(_token_span(token.text, color, background_color))
            parts.append(" ")  # space between tokens
        parts.append("<br>")  # preserve line breaks
    return "".join(parts)
def colorize_and_update(system_message, submit_update):
    """Colorize a chat message and push the HTML to the UI callback."""
    html = colorize_text(system_message['content'])
    # Only the second (HTML) output is updated; the text pane is untouched.
    submit_update(None, html)
def update_text_output(system_message, submit_update):
    """Push the raw message text to the UI callback, leaving the HTML pane alone."""
    content = system_message['content']
    submit_update(content, None)
def train(text):
    """Upload *text* to the Notion page, titled with the current Eastern time."""
    # Fixed UTC-4 offset timestamp used as the page title.
    eastern_now = datetime.now(timezone(timedelta(hours=-4)))
    title = eastern_now.strftime('%m-%d-%y %H:%M')
    frame = pd.DataFrame([text])
    notion_df.upload(
        frame,
        'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4',
        title=str(title),
        api_key=API_KEY,
    )
def transcribe(audio, text, submit_update=None):
    """Handle one chat turn: transcribe audio, query GPT-4, log the exchange.

    Parameters
    ----------
    audio : str | None
        Path to a recorded audio file, or None when only text was entered.
    text : str | None
        Typed user input.  If its first word is "COLORIZE", the note is only
        archived to Notion and colorized — no chat completion is requested.
    submit_update : callable | None
        Optional Gradio callback used to stream partial UI updates.

    Returns
    -------
    tuple[str, str]
        (plain assistant reply, HTML-colorized assistant reply).
    """
    global messages
    global answer_count
    transcript = {'text': ''}
    # Must be a string (not a list, as originally initialized): it is
    # concatenated with transcript["text"] below even when `text` is None.
    input_text = ""
    # "COLORIZE" mode: archive and colorize the note, skip the chat model.
    if text and text.split("\n")[0].split(" ")[0].strip().upper() == "COLORIZE":
        train(text)
        colorized_input = colorize_text(text)
        return text, colorized_input
    # Transcribe the audio if provided; `with` closes the handle promptly
    # (the original leaked the open file object).
    if audio is not None:
        with open(audio, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
    # Truncate the typed input to at most 1440 GPT-2 tokens, cutting on
    # sentence boundaries where possible.
    if text is not None:
        sentences = re.split("(?<=[.!?]) +", text)
        input_tokens = []
        for sentence in sentences:
            sentence_tokens = tokenizer.encode(sentence)
            if len(input_tokens) + len(sentence_tokens) < 1440:
                input_tokens.extend(sentence_tokens)
            else:
                # This sentence would overflow the budget: keep its prefix.
                sentence_tokens = sentence_tokens[:1440 - len(input_tokens)]
                input_tokens.extend(sentence_tokens)
                break
        input_text = tokenizer.decode(input_tokens)
    # Record the combined (speech + typed) user message.
    messages.append({"role": "user", "content": transcript["text"] + input_text})
    # When the accumulated history exceeds 2096 tokens, archive the full
    # transcript to Notion and restart from the system prompt.
    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
    if num_tokens > 2096:
        chat_transcript = "\n\n".join(
            [f"[ANSWER {answer_count}]{message['role']}: {message['content']}"
             for message in messages if message['role'] != 'system'])
        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
        # Timestamp in Eastern Time (fixed UTC-4 offset; ignores DST — TODO confirm).
        now_et = datetime.now(timezone(timedelta(hours=-4)))
        published_date = now_et.strftime('%m-%d-%y %H:%M')
        df = pd.DataFrame([chat_transcript])
        notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date + 'FULL'), api_key=API_KEY)
        messages = [initial_message]
        messages.append({"role": "user", "content": initialt})
        answer_count = 0
        messages.append({"role": "user", "content": input_text})
    else:
        answer_count += 1
    # Ask GPT-4 for the assistant reply.  (The original wrapped this in a
    # ThreadPoolExecutor that was never used, alongside an unused `prompt`
    # list; both removed — the call was always synchronous.)
    system_message = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=2000
    )["choices"][0]["message"]
    # Show the plain reply immediately if a UI callback was supplied.
    if submit_update:
        update_text_output(system_message, submit_update)
    messages.append(system_message)
    # Maintain the reverse-chronological view used for the transcript.
    messages_rev.insert(0, system_message)
    messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})
    # Colorization is slow, so run it on a background thread and let it
    # update the UI when it finishes.
    if submit_update:
        colorize_thread = threading.Thread(target=colorize_and_update, args=(system_message, submit_update))
        colorize_thread.start()
    # Build, persist, and upload the running transcript.
    chat_transcript = "\n\n".join(
        [f"[ANSWER {answer_count}]{message['role']}: {message['content']}"
         for message in messages_rev if message['role'] != 'system'])
    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
    with open("conversation_history.txt", "a") as f:
        f.write(chat_transcript)
    now_et = datetime.now(timezone(timedelta(hours=-4)))
    published_date = now_et.strftime('%m-%d-%y %H:%M')
    df = pd.DataFrame([chat_transcript])
    notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
    return system_message['content'], colorize_text(system_message['content'])
# --- Gradio UI wiring ---------------------------------------------------
# Microphone input is passed to transcribe() as a file path.
audio_input = Audio(source="microphone", type="filepath", label="Record your message")
text_input = Textbox(label="Type your message", max_length=4096)
# Plain-text pane for the assistant reply.
output_text = Textbox(label="Text Output")
# HTML pane for the colorized reply.
output_html = Markdown()
# NOTE(review): output_audio is created but never wired into the interface.
output_audio = Audio()
# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, text_input],
    outputs=[output_text, output_html],
    title="Hold On, Pain Ends (HOPE)",
    description="Talk to Your USMLE Tutor HOPE. \n If you want to colorize your note, type COLORIZE in the first line of your input.",
    theme="compact",
    layout="vertical",
    allow_flagging=False
)
# Run the Gradio interface (blocks until the server stops).
iface.launch()