import openai
import gradio as gr
from gradio.components import Audio, Textbox
import os
import re
import tiktoken
from transformers import GPT2Tokenizer
import whisper
import pandas as pd
from datetime import datetime, timezone, timedelta
import notion_df
import concurrent.futures
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from spacy import displacy
from gradio import Markdown
import threading

nltk.download('punkt')

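# NOTE: this script targets the pre-1.0 openai Python SDK (openai.Audio / openai.ChatCompletion)
# and Gradio 3.x-style components (gradio.components.Audio with source="microphone").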
# The GPT-2 tokenizer is used locally to count and trim tokens before calling the API.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

openai.api_key = os.environ["OPENAI_API_KEY"]

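# System prompts that steer the tutor: initial_message seeds every conversation, and
# initmessage is re-sent as a user message whenever the conversation is reset after a
# token-limit backup.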
initmessage = 'You are an MCAT tutor. ALWAYS respond with layered "bullet points" (lists rather than sentences) plus a fun mnemonic to memorize each list. You may answer with up to 1200 words if the user requests a longer response.'

initial_message = {"role": "system", "content": 'You are an MCAT tutor. Pay special attention to "testable," "exam," or any related terms in the input and highlight them as "EXAM TOPIC." ALWAYS quiz me with high-yield, relevant questions on the input, with the answers laid out as layered "bullet points" (lists rather than sentences) and a fun mnemonic to memorize each list. Expand on each point with detailed lists, not sentences.'}

messages = [initial_message]
messages_rev = [initial_message]

answer_count = 0

# Notion integration token used to back up chat transcripts.
API_KEY = os.environ["API_KEY"]

nlp = spacy.load("en_core_web_sm")

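# spaCy-based highlighting: named entities are bolded and the major part-of-speech classes
# are wrapped in color-coded HTML spans so the Markdown output is easier to scan.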
def process_nlp(system_message):
    colorized_text = colorize_text(system_message['content'])
    return colorized_text


def colorize_text(text):
    colorized_text = ""
    lines = text.split("\n")

    for line in lines:
        doc = nlp(line)
        for token in doc:
            if token.ent_type_:
                colorized_text += f'**{token.text_with_ws}**'
            elif token.pos_ == 'NOUN':
                colorized_text += f'<span style="color: #FF3300; background-color: transparent;">{token.text_with_ws}</span>'
            elif token.pos_ == 'VERB':
                colorized_text += f'<span style="color: #FFFF00; background-color: transparent;">{token.text_with_ws}</span>'
            elif token.pos_ == 'ADJ':
                colorized_text += f'<span style="color: #00CC00; background-color: transparent;">{token.text_with_ws}</span>'
            elif token.pos_ == 'ADV':
                colorized_text += f'<span style="color: #FF6600; background-color: transparent;">{token.text_with_ws}</span>'
            elif token.is_digit:
                colorized_text += f'<span style="color: #9900CC; background-color: transparent;">{token.text_with_ws}</span>'
            elif token.is_punct:
                colorized_text += f'<span style="color: #8B4513; background-color: transparent;">{token.text_with_ws}</span>'
            elif token.is_quote:
                colorized_text += f'<span style="color: #008080; background-color: transparent;">{token.text_with_ws}</span>'
            else:
                colorized_text += token.text_with_ws
        colorized_text += "<br>"

    return colorized_text

def colorize_and_update(system_message, submit_update):
    # Runs on a background thread: colorize the reply, then push the HTML to the UI.
    colorized_system_message = colorize_text(system_message['content'])
    submit_update(None, colorized_system_message)


def update_text_output(system_message, submit_update):
    # Push the plain-text reply to the UI immediately, before colorization finishes.
    submit_update(system_message['content'], None)

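# transcribe() is the single Gradio handler: it transcribes any recorded audio with Whisper,
# trims the typed text to a token budget, asks the chat model for a reply, backs the
# transcript up to Notion, and returns both plain text and colorized HTML.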
def transcribe(audio, text, submit_update=None):
    global messages
    global answer_count

    transcript = {'text': ''}
    input_text = ""

    # Transcribe any recorded audio with OpenAI's hosted Whisper model.
    if audio is not None:
        with open(audio, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")

    # Trim the typed text to roughly 1440 GPT-2 tokens, cutting at sentence boundaries.
    if text is not None:
        sentences = re.split("(?<=[.!?]) +", text)
        input_tokens = []
        for sentence in sentences:
            sentence_tokens = tokenizer.encode(sentence)
            if len(input_tokens) + len(sentence_tokens) < 1440:
                input_tokens.extend(sentence_tokens)
            else:
                # Keep only as many tokens as still fit, then stop.
                sentence_tokens = sentence_tokens[:1440 - len(input_tokens)]
                input_tokens.extend(sentence_tokens)
                break
        input_text = tokenizer.decode(input_tokens)

    messages.append({"role": "user", "content": transcript["text"] + input_text})

    # Total tokens accumulated in the conversation so far.
    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
    # If the conversation is approaching the model's context limit, back it up to Notion
    # and start a fresh conversation seeded with the tutoring instructions.
    if num_tokens > 2096:
        # Dump every non-system message, tagged with the running answer count.
        chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'system'])
        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"

        # Timestamp the backup in Eastern Time (UTC-4).
        now_et = datetime.now(timezone(timedelta(hours=-4)))
        published_date = now_et.strftime('%m-%d-%y %H:%M')

        df = pd.DataFrame([chat_transcript])
        notion_df.upload(df, 'https://www.notion.so/YENA-be569d0a40c940e7b6e0679318215790?pvs=4', title=str(published_date + 'back_up'), api_key=API_KEY)

        # Reset the conversation and re-seed it with the tutoring instructions.
        messages = [initial_message]
        messages.append({"role": "user", "content": initmessage})
        answer_count = 0
        messages.append({"role": "user", "content": input_text})
    else:
        answer_count += 1

    # Ask the chat model for the next tutoring reply.
    system_message = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=2000
    )["choices"][0]["message"]

    # Show the plain-text reply right away; colorization happens in the background.
    if submit_update:
        update_text_output(system_message, submit_update)

    # Record the reply, and keep a reversed copy with the newest exchange first.
    messages.append(system_message)
    messages_rev.insert(0, system_message)
    messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})

    # Colorize the reply on a background thread so the UI is not blocked.
    if submit_update:
        colorize_thread = threading.Thread(target=colorize_and_update, args=(system_message, submit_update))
        colorize_thread.start()

    chat_transcript = system_message['content']

    # Back up every reply to Notion, timestamped in Eastern Time (UTC-4).
    now_et = datetime.now(timezone(timedelta(hours=-4)))
    published_date = now_et.strftime('%m-%d-%y %H:%M')

    df = pd.DataFrame([chat_transcript])
    notion_df.upload(df, 'https://www.notion.so/YENA-be569d0a40c940e7b6e0679318215790?pvs=4', title=str(published_date + 'back_up'), api_key=API_KEY)

    # Return both the plain reply and its colorized HTML for the two output components.
    return system_message['content'], colorize_text(system_message['content'])

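# Gradio wiring: microphone audio and a text box feed transcribe(); the reply is rendered
# both as plain text and as colorized Markdown/HTML.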
audio_input = Audio(source="microphone", type="filepath", label="Record your message")
text_input = Textbox(label="Type your message", max_length=4096)

output_text = Textbox(label="Text Output")
output_html = Markdown()

iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, text_input],
    outputs=[output_text, output_html],
    title="Hold On, Pain Ends (HOPE)",
    description="Talk to Your USMLE Tutor HOPE",
    theme="compact",
    layout="vertical",
    allow_flagging="never"
)

iface.launch()
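# Note: launch() serves the app locally; passing share=True would also create a temporary public link.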