lordvader31 committed
Commit • 79b94f8
Parent(s): 499c2f5
major update from bitbucket
Changed files:
- app.py +366 -0
- classifier.py +0 -0
- keywords.py +28 -0
- mindmap.py +50 -0
- models.py +93 -0
- prompts/mindmap.prompt +11 -0
- summary.py +55 -0
- takeaways.py +51 -0
- transcription.py +302 -0
app.py
ADDED
@@ -0,0 +1,366 @@
# Streamlit classes
import streamlit as st
from streamlit_agraph import agraph, Node, Edge, Config
from streamlit_chat import message

# Data manipulation and embeddings
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import distances_from_embeddings
import whisper

# Exec tasks
import os, json
import math
import re

# Custom classes
from transcription import *
from keywords import Keywords
from summary import TextSummarizer
from takeaways import KeyTakeaways
from mindmap import MindMap
import models as md


REGEXP_YOUTUBE_URL = r"^(https?\:\/\/)?((www\.)?youtube\.com|youtu\.be)\/.+$"

model = whisper.load_model('base')

output = ''
data = []
data_transcription = {"title": "", "text": ""}
embeddings = []
text_chunks_lib = dict()
user_input = None

tldr = ""
summary = ""
takeaways = []

folder_name = "./tests"
input_accepted = False
is_completed_analysis = False

def get_initial_message():
    messages = [
        {"role": "system", "content": "You are a helpful AI tutor who answers brief questions about AI."},
        {"role": "user", "content": "I want to learn AI"},
        {"role": "assistant", "content": "That's awesome, what do you want to know about AI?"}
    ]
    return messages

# Example graph elements from the streamlit_agraph docs (not rendered anywhere yet)
nodes = []
edges = []

nodes.append(Node(id="Spiderman",
                  label="Peter Parker",
                  size=25,
                  shape="circularImage",
                  image="http://marvel-force-chart.surge.sh/marvel_force_chart_img/top_spiderman.png")
             )  # includes **kwargs
nodes.append(Node(id="Captain_Marvel",
                  size=25,
                  shape="circularImage",
                  image="http://marvel-force-chart.surge.sh/marvel_force_chart_img/top_captainmarvel.png")
             )
edges.append(Edge(source="Captain_Marvel",
                  label="friend_of",
                  target="Spiderman",
                  )
             )

config = Config(width=750,
                height=950,
                directed=True,
                physics=True,
                hierarchical=False,
                )


user_secret = os.getenv("OPENAI_API_KEY")

# Define the purpose of the application
st.header('Almithal')
st.subheader('Almithal is a comprehensive video and PDF study buddy.')
st.write('It provides a summary, transcription, key insights, a mind map and a Q&A feature where you can actually "talk" to the datasource.')

bar = st.progress(0)

# =========== SIDEBAR FOR GENERATION ===========
with st.sidebar:
    youtube_link = st.text_input(label="Type in your YouTube link", placeholder="", key="url")
    st.markdown("OR")
    pdf_file = st.file_uploader("Upload your PDF", type="pdf")
    st.markdown("OR")
    audio_file = st.file_uploader("Upload your MP3 audio file", type=["wav", "mp3"])

    gen_keywords = st.radio(
        "Generate keywords from text?",
        ('Yes', 'No')
    )

    gen_summary = st.radio(
        "Generate summary from text? (recommended for label matching below, but will take longer)",
        ('Yes', 'No')
    )

    if st.button("Start Analysis"):

        # Check if it is a valid YouTube URL
        if re.search(REGEXP_YOUTUBE_URL, youtube_link):
            vte = VideoTranscription(youtube_link)
            YOUTUBE_VIDEO_ID = youtube_link.split("=")[1]
            folder_name = f"./tests/{YOUTUBE_VIDEO_ID}"
            if not os.path.exists(folder_name):
                os.mkdir(folder_name)

            with st.spinner('Running process...'):
                data_transcription = vte.transcribe()
                segments = data_transcription['segments']

            with open(f"{folder_name}/data.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

        # PDF transcription
        elif pdf_file is not None:
            pte = PDFTranscription(pdf_file)
            folder_name = pte.get_redacted_name()
            if not os.path.exists(folder_name):
                os.mkdir(folder_name)

            with st.spinner('Running process...'):
                data_transcription = pte.transcribe()
                segments = data_transcription['segments']

        # Audio transcription
        elif audio_file is not None:
            ate = AudioTranscription(audio_file)
            folder_name = ate.get_redacted_name()
            if not os.path.exists(folder_name):
                os.mkdir(folder_name)

            with st.spinner('Running process...'):
                data_transcription = ate.transcribe()
                segments = data_transcription['segments']

            with open(f"{folder_name}/data.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

        else:
            st.error("Please type in your YouTube link or upload a PDF")
            st.experimental_rerun()

        # Save the transcript information
        with open(f"{folder_name}/data_transcription.json", "w") as f:
            json.dump(data_transcription, f, indent=4)

        # Generate embeddings
        if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
            for i, segment in enumerate(segments):
                bar.progress(max(math.ceil((i / len(segments) * 50)), 1))
                response = openai.Embedding.create(
                    input=segment["text"].strip(),
                    model="text-embedding-ada-002"
                )
                embeddings = response['data'][0]['embedding']
                meta = {
                    "text": segment["text"].strip(),
                    "embedding": embeddings
                }
                data.append(meta)

            pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
        else:
            data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
            embeddings = data["embedding"]

        bar.progress(75)

        text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text": [data_transcription["text"]]})
        input_accepted = True

        with st.spinner('Breaking up the text and doing analysis...'):
            # For each body of text, create text chunks of a certain token size required for the transformer
            title_entry = text_df['title'][0]
            for i in range(0, len(text_df)):
                nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
                # For each chunk of sentences (within the token max), join them into one passage
                text_chunks = []
                for n in range(0, len(nested_sentences)):
                    tc = " ".join(map(str, nested_sentences[n]))
                    text_chunks.append(tc)

                text_chunks_lib[title_entry] = text_chunks

            # Generate keywords
            key_engine = Keywords(title_entry)
            keywords = key_engine.get_keywords(text_chunks_lib)

        # Generate the summary
        if gen_summary == 'Yes':
            se = TextSummarizer(title_entry)
            text_transcription = data_transcription['text']
            with st.spinner("Generating summary and TLDR..."):
                summary = se.generate_full_summary(text_chunks_lib)
                summary_list = summary.split("\n\n")
                tldr = se.generate_short_summary(summary_list)

        # Generate key takeaways
        kt = KeyTakeaways()
        with st.spinner("Generating key takeaways ... "):
            takeaways = kt.generate_key_takeaways(text_chunks_lib)

        is_completed_analysis = True
        bar.progress(100)

if is_completed_analysis:
    st.header("Key Takeaways")
    st.write("Here are some of the key takeaways from the data:")
    for takeaway in takeaways:
        st.markdown(f"- {takeaway}")


tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["Introduction", "Summary", "Transcription", "Mind Map", "Keywords", "Q&A"])

# =========== INTRODUCTION ===========
with tab1:
    st.subheader("Introduction")
    st.markdown("## How do I use this?")
    st.markdown("Do one of the following:")
    st.markdown('* Type in the YouTube URL that you want worked on')
    st.markdown('* Upload the PDF file that you want worked on')
    st.markdown("**Once the file/URL has finished saving, a 'Start Analysis' button will appear. Click on this button to begin the note generation.**")
    st.warning("NOTE: This is just a demo product in alpha testing. Any and all bugs will soon be fixed.")
    st.warning("After the note taking is done, you will see multiple tabs with more information.")

# =========== SUMMARIZATION ===========
with tab2:
    if is_completed_analysis:
        st.header("TL;DR")
        for point in tldr:
            st.markdown(f"- {point}")
        st.header("Summary")
        st.write(summary)
    else:
        st.warning("Please wait for the analysis to finish")

# =========== TRANSCRIPTION ===========
with tab3:
    st.header("Transcription")
    if is_completed_analysis:
        with st.spinner("Generating transcript ..."):
            st.write("")
            for text in text_chunks_lib[title_entry]:
                st.write(text)
    else:
        st.warning("Please wait for the analysis to finish")

# =========== MIND MAP ===========
with tab4:
    st.header("Mind Map")
    if is_completed_analysis:
        mindmap = MindMap()
        with st.spinner("Generating mind map..."):
            mindmap.generate_graph(text_chunks_lib)
    else:
        st.warning("Please wait for the analysis to finish")

# =========== KEYWORDS ===========
with tab5:
    st.header("Keywords:")
    if is_completed_analysis and gen_keywords == 'Yes':
        for i, keyword in enumerate(keywords):
            st.markdown(f"{i+1}. {keyword}")
    else:
        st.warning("Please wait for the analysis to finish")

# =========== QUERY BOT ===========
with tab6:
    if 'generated' not in st.session_state:
        st.session_state['generated'] = []

    if 'past' not in st.session_state:
        st.session_state['past'] = []

    def get_text():
        st.header("Ask me something about the video:")
        input_text = st.text_input("You: ", key="prompt")
        return input_text


    def get_embedding_text(prompt):
        response = openai.Embedding.create(
            input=prompt.strip(),
            model="text-embedding-ada-002"
        )
        q_embedding = response['data'][0]['embedding']
        df = pd.read_csv(f'{folder_name}/word_embeddings.csv', index_col=0)
        df['embedding'] = df['embedding'].apply(eval).apply(np.array)

        df['distances'] = distances_from_embeddings(q_embedding, df['embedding'].values, distance_metric='cosine')
        returns = []

        # Keep the four chunks whose embeddings are closest to the question
        for i, row in df.sort_values('distances', ascending=True).head(4).iterrows():
            returns.append(row["text"])

        # Return the context
        return "\n\n###\n\n".join(returns)

    def generate_response(prompt):
        one_shot_prompt = '''I am YoutubeGPT, a highly intelligent question answering bot.
If you ask me a question that is rooted in truth, I will give you the answer.
Q: What is human life expectancy in the United States?
A: Human life expectancy in the United States is 78 years.
Q: ''' + prompt + '''
A: '''
        completions = openai.Completion.create(
            engine="text-davinci-003",
            prompt=one_shot_prompt,
            max_tokens=1024,
            n=1,
            stop=["Q:"],
            temperature=0.5,
        )
        answer = completions.choices[0].text  # named so it does not shadow streamlit_chat.message
        return answer

    if is_completed_analysis:
        user_input = get_text()
    else:
        user_input = None

    if 'messages' not in st.session_state:
        st.session_state['messages'] = get_initial_message()

    if user_input:
        text_embedding = get_embedding_text(user_input)
        with open(f'{folder_name}/data_transcription.json', "r") as f:
            title = json.load(f)['title']
        user_input_embedding = 'Using this context: "' + title + '. ' + text_embedding + '", answer the following question. \n' + user_input
        output = generate_response(user_input_embedding)
        st.session_state.past.append(user_input)
        st.session_state.generated.append(output)

    if st.session_state['generated']:
        for i in range(len(st.session_state['generated']) - 1, -1, -1):
            message(st.session_state["generated"][i], key=str(i))
            message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')


# st.header("What else")
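
The Q&A tab above answers questions by embedding the user's prompt and ranking the stored chunk embeddings by cosine distance. For illustration only (this is not a file in the commit), a minimal self-contained sketch of that retrieval step, using plain numpy and toy vectors in place of the OpenAI embedding calls:

# Toy stand-ins for the stored chunk embeddings; no API calls needed here.
import numpy as np

def cosine_distance(a, b):
    """1 - cosine similarity, matching distance_metric='cosine' above."""
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def top_k_context(query_emb, chunk_texts, chunk_embs, k=4):
    """Join the k chunk texts whose embeddings sit closest to the query."""
    ranked = sorted(zip((cosine_distance(query_emb, e) for e in chunk_embs), chunk_texts))
    return "\n\n###\n\n".join(text for _, text in ranked[:k])

rng = np.random.default_rng(0)
chunk_texts = [f"chunk {i}" for i in range(10)]
chunk_embs = [rng.normal(size=8) for _ in chunk_texts]
print(top_k_context(rng.normal(size=8), chunk_texts, chunk_embs))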
classifier.py
ADDED
File without changes
keywords.py
ADDED
@@ -0,0 +1,28 @@
import models as md
import pandas as pd

class Keywords:

    def __init__(self, title_element:str):
        self.title_element = title_element  # keep the title this instance was built for
        self.kw_model = md.load_keyword_model()

    def get_keywords(self, text_chunks_lib:dict) -> list:
        kw_dict = dict()
        text_chunk_counter = 0

        for key in text_chunks_lib:
            keywords_list = []
            for text_chunk in text_chunks_lib[key]:
                text_chunk_counter += 1
                keywords_list += md.keyword_gen(self.kw_model, text_chunk)
            kw_dict[key] = dict(keywords_list)

        # Display as a dataframe
        kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
        kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
        kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()

        kw_column_list = ['keyword', 'score']
        kw_df = kw_df[kw_df['score'] > 0.25][kw_column_list].sort_values(['score'], ascending=False).reset_index().drop(columns='index')

        return kw_df['keyword']
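
For reference, a hypothetical usage sketch of Keywords (illustration only, not part of the commit); it assumes keybert and the models module are importable, and the chunk text here is made up:

# Hypothetical input: a title mapped to its list of text chunks.
from keywords import Keywords

chunks = {"My Talk": ["Transformers use attention to weigh context.",
                      "Fine-tuning adapts a pretrained model to a new task."]}
kw = Keywords("My Talk")
print(kw.get_keywords(chunks))  # a pandas Series of keywords scoring above 0.25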
mindmap.py
ADDED
@@ -0,0 +1,50 @@
import os
import openai
import json
import graphviz
import streamlit as st

class MindMap:

    def __init__(self):
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def get_connections(self, text_chunks_libs:dict) -> list:

        with open("./prompts/mindmap.prompt") as state_prompt:
            PROMPT = state_prompt.read()

        final_connections = []
        for key in text_chunks_libs:
            for text_chunk in text_chunks_libs[key]:
                # Fill the template fresh for every chunk; the "$prompt"
                # placeholder only exists in the unmodified template.
                chunk_prompt = PROMPT.replace("$prompt", text_chunk)

                response = openai.Completion.create(
                    engine="text-davinci-003",
                    prompt=chunk_prompt,
                    temperature=0.5,
                    max_tokens=2048,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                )

                relationships = response.choices[0].text
                final_string = '{"relations":' + relationships + '}'
                data = json.loads(final_string)
                relations = data["relations"]
                final_connections.extend(relations)
        return final_connections


    def generate_graph(self, text_chunks_libs:dict):
        graph = graphviz.Digraph()
        all_connections = self.get_connections(text_chunks_libs)
        for connection in all_connections:
            from_node = connection[0]
            to_node = connection[2]
            graph.edge(from_node, to_node)
        st.graphviz_chart(graph)
models.py
ADDED
@@ -0,0 +1,93 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, BartTokenizer, BartForConditionalGeneration

import streamlit as st
from keybert import KeyBERT
import re

def create_nest_sentences(document:str, token_max_length = 1024):
    """Group the document's sentences into 'nests' whose combined token count stays under token_max_length."""
    nested = []
    sent = []
    length = 0
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0]  # Hugging Face transformer tokenizer
        length += len(tokens_in_sentence)

        if length < token_max_length:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = [sentence]
            length = len(tokens_in_sentence)  # the new nest already holds this sentence's tokens

    if sent:
        nested.append(sent)
    return nested

@st.cache_data
def load_keyword_model():
    kw_model = KeyBERT()
    return kw_model

def keyword_gen(kw_model, sequence:str):
    keywords = kw_model.extract_keywords(sequence,
                                         keyphrase_ngram_range=(1, 1),
                                         stop_words='english',
                                         use_mmr=True,
                                         diversity=0.5,
                                         top_n=10)
    return keywords



# Reference: https://huggingface.co/facebook/bart-large-mnli
@st.cache_data
def load_summary_model():
    model_name = "facebook/bart-large-cnn"
    summarizer = pipeline(task='summarization', model=model_name)
    return summarizer

def load_summary_model_large():
    model_name = "facebook/bart-large-mnli"
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    summarizer = pipeline(task='summarization', model=model, tokenizer=tokenizer, framework='pt')
    return summarizer

def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
    output = summarizer(sequence,
                        num_beams=4,
                        length_penalty=2.0,
                        max_length=maximum_tokens,
                        min_length=minimum_tokens,
                        do_sample=False,
                        early_stopping=True,
                        no_repeat_ngram_size=3)
    return output[0].get('summary_text')


# Reference: https://www.datatrigger.org/post/nlp_hugging_face/
# Custom summarization pipeline (to handle long articles)
# def summarize(text, minimum_length_of_summary = 100):
#     # Tokenize and truncate
#     inputs = tokenizer([text], truncation=True, max_length=1024, return_tensors='pt').to('cuda')
#     # Generate summary
#     summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, min_length = minimum_length_of_summary, max_length=400, early_stopping=True)
#     # Untokenize
#     return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])


# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
@st.cache_data
def load_model():
    model_name = "facebook/bart-large-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline(task='zero-shot-classification', model=model, tokenizer=tokenizer, framework='pt')
    return classifier

def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
    outputs = classifier(sequence, labels, multi_label=multi_class)
    return outputs['labels'], outputs['scores']
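
To see what create_nest_sentences produces, a small sketch (illustration only, not a committed file) that forces a split with a tiny token budget; real callers such as app.py pass token_max_length=1024:

# Illustration: group sentences into token-bounded nests with a tiny budget.
from models import create_nest_sentences

doc = ("Streamlit renders the page. Whisper transcribes the audio. "
       "BART summarizes each chunk. KeyBERT extracts keywords.")

# A small budget forces several nests; app.py passes 1024.
for nest in create_nest_sentences(doc, token_max_length=12):
    print(nest)  # each nest is a list of sentences that fit the budget together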
prompts/mindmap.prompt
ADDED
@@ -0,0 +1,11 @@
Given a prompt, extrapolate as many relationships as possible from it and provide a list of updates.

If an update is a relationship, provide [ENTITY 1, RELATIONSHIP, ENTITY 2]. The relationship is directed, so the order matters.

Example:
prompt: Alice is Bob's roommate. Bob is Charlie's friend.
updates:
[["Alice", "roommate", "Bob"], ["Bob", "friend", "Charlie"]]

prompt: $prompt
updates:
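
The completion for this template is expected to be a JSON list of directed triples, which mindmap.py wraps in a {"relations": ...} object and parses. A minimal sketch of that parsing step (illustration only), using the example output above:

import json

# The model should answer with a JSON list of [head, relation, tail] triples.
raw = '[["Alice", "roommate", "Bob"], ["Bob", "friend", "Charlie"]]'
relations = json.loads('{"relations":' + raw + '}')["relations"]
for head, relation, tail in relations:
    print(f"{head} --{relation}--> {tail}")  # mindmap.py calls graph.edge(head, tail)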
summary.py
ADDED
@@ -0,0 +1,55 @@
import models as md
import nltk

import openai
import os

nltk.download("punkt")

class TextSummarizer:

    def __init__(self, title):
        self.title = title
        self.model = "gpt-3.5-turbo"
        self.summarizer = md.load_summary_model()
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def generate_short_summary(self, summary_chunks:list) -> list:
        PROMPT = """
        You are a helpful assistant that summarizes YouTube videos.
        Someone has already summarized the video to key points.
        Summarize the key points in at most two sentences that capture the essence of the passage.
        """

        final_summary = []
        for summary_chunk in summary_chunks:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": PROMPT},
                    {"role": "user", "content": summary_chunk},
                ],
            )
            summary = response["choices"][0]["message"]["content"]
            final_summary.append(summary)

        return final_summary


    def generate_full_summary(self, text_chunks_lib:dict) -> str:
        sum_dict = dict()
        for key in text_chunks_lib:
            # Summarize each chunk with the local BART model
            summary = []
            for text_chunk in text_chunks_lib[key]:
                chunk_summary = md.summarizer_gen(self.summarizer, sequence=text_chunk, maximum_tokens=500, minimum_tokens=100)
                summary.append(chunk_summary)

            # Combine all the summaries into a list and compress into one document, again
            final_summary = "\n\n".join(summary)
            sum_dict[key] = [final_summary]

        return sum_dict[self.title][0]
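
A hypothetical round trip through TextSummarizer (illustration only, not a committed file): the chunk strings below are placeholders for real transcript chunks, and OPENAI_API_KEY must be set for the TLDR step:

# Hypothetical round trip; "<chunk ...>" stands in for real transcript chunks.
import os
from summary import TextSummarizer

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder key for the TLDR step
chunks = {"My Talk": ["<chunk one text>", "<chunk two text>"]}

se = TextSummarizer("My Talk")
full = se.generate_full_summary(chunks)               # local BART, one summary per chunk
tldr = se.generate_short_summary(full.split("\n\n"))  # gpt-3.5-turbo compresses each
print(tldr)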
takeaways.py
ADDED
@@ -0,0 +1,51 @@
import openai
import os

def extract_bullet_points(text):
    """
    Extract bullet points from a string and return a list of bullet points.

    Args:
        text (str): The input text containing bullet points.

    Returns:
        list: A list of bullet points.
    """
    bullet_points = []
    lines = text.split("\n")
    for line in lines:
        # Check if the line starts with a bullet marker (e.g. "* ", "- ", "1. ", etc.)
        if line.strip().startswith(("* ", "- ", "• ", "· ", "1. ", "2. ", "3. ", "4. ", "5. ", "6. ", "7. ", "8. ", "9. ")):
            # Drop the marker; the extra strip() handles the three-character numbered markers
            bullet_points.append(line.strip()[2:].strip())
    return bullet_points


class KeyTakeaways:

    def __init__(self):
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def generate_key_takeaways(self, text_chunks_lib:dict) -> list:
        PROMPT = """
        You are a super intelligent human and helpful assistant.
        I am giving you parts of a video transcription that I want to learn from.
        In bullet points, give me at most 3 key takeaways from this text.
        """

        final_takeaways = []
        for key in text_chunks_lib:
            for text_chunk in text_chunks_lib[key]:
                response = openai.Completion.create(
                    engine="text-davinci-003",
                    prompt=PROMPT + text_chunk,
                    temperature=0.4,
                    max_tokens=1024,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.6,
                )
                takeaways = extract_bullet_points(response.choices[0].text.strip())
                final_takeaways.extend(takeaways)

        return final_takeaways
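
A worked example of extract_bullet_points on a mixed bullet list (illustration only, not part of the commit):

from takeaways import extract_bullet_points

text = """Here are the points:
- attention weighs context
* chunking respects token limits
1. embeddings enable retrieval"""

print(extract_bullet_points(text))
# -> ['attention weighs context', 'chunking respects token limits',
#     'embeddings enable retrieval']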
transcription.py
ADDED
@@ -0,0 +1,302 @@
# For downloading from youtube and transcribing audio
from pytube import YouTube
from moviepy.editor import *
from pydub import AudioSegment
from pydub.utils import make_chunks
import pydub
from pathlib import Path

# For getting text from PDF
from zipfile import ZipFile
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# For transcription
import openai, whisper, torch
from faster_whisper import WhisperModel
import tiktoken
from nltk import tokenize

# For other stuff
import os, re
import time, math

# USEFUL CONSTANTS

# Duration is set to 6 minutes = 360 seconds = 360,000 milliseconds
DURATION = 360000

# Maximum audio file size is 18MB
MAX_FILE_SIZE_BYTES = 18000000

# The model to use for transcription
WHISPER_MODEL = "tiny"
MODEL_SIZE = "base"

class DownloadAudio:
    """Downloads the audio from a youtube video and saves it to multiple .wav files in the specified folder"""

    def __init__(self, link) -> None:
        self.link = link
        self.yt = YouTube(self.link)
        self.YOUTUBE_VIDEO_ID = link.split("=")[1]
        self.WAV_FILE_NAME = f"{self.YOUTUBE_VIDEO_ID}.wav"

    def get_yt_title(self) -> str:
        """Returns the title of the youtube video"""
        while True:
            try:
                title = self.yt.title
                return title
            except Exception:
                print("Failed to get name. Retrying...")
                time.sleep(1)
                self.yt = YouTube(self.link)
                continue

    def download(self, pathname:str):
        """
        Downloads the audio from the youtube video, saving chunked .wav files
        in the specified folder. Returns the path to the full .wav file.
        """

        # Check if the folder for the VIDEO_ID exists
        if not os.path.exists(pathname):
            os.mkdir(pathname)
        FINAL_WAV_PATH = f"{pathname}/{self.WAV_FILE_NAME}"

        if not os.path.exists(FINAL_WAV_PATH):
            # Download the .mp4 file
            audiostream = self.yt.streams.filter(only_audio=True).first()
            outfile_path = audiostream.download(pathname)

            # Convert the .mp4 file to .wav
            wav_file = AudioFileClip(outfile_path)
            wav_file.write_audiofile(FINAL_WAV_PATH, bitrate="16k", fps=16000)

        # Load the input .wav file
        audio = AudioSegment.from_wav(FINAL_WAV_PATH)

        # Get the size of the input file in bytes
        total_byte_size = os.path.getsize(FINAL_WAV_PATH)

        # If the file already fits under the size limit, just return the original file
        if total_byte_size < MAX_FILE_SIZE_BYTES:
            return FINAL_WAV_PATH

        # Get the size of the wav file
        channels = audio.channels
        sample_width = audio.sample_width
        duration_in_sec = math.ceil(len(audio) / 1000)
        sample_rate = audio.frame_rate
        bit_rate = sample_width * 8
        wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8

        # Get the length of each chunk in milliseconds and make the chunks
        chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES) / wav_file_size)  # in sec
        chunk_length_ms = chunk_length_in_sec * 1000
        chunks = make_chunks(audio, chunk_length_ms)

        # Export all of the individual chunks as wav files
        chunk_names = []
        for i, chunk in enumerate(chunks):
            chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
            output_chunk_path = f"{pathname}/{chunk_name}"
            chunk_names.append(output_chunk_path)
            chunk.export(f"{output_chunk_path}", format="wav")

        return FINAL_WAV_PATH


class VideoTranscription:
    """Performs transcription on a PDF or a link to a youtube video"""

    def __init__(self, datalink) -> None:
        self.datalink = datalink
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    def transcribe(self) -> dict:
        """Returns the transcription of the PDF or youtube video as a string"""

        start_time = time.time()
        if self.datalink.startswith("http"):
            transcript = self.get_text_from_link()
        else:
            transcript = self.get_text_from_pdf()
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")
        return transcript

    def get_text_from_link(self) -> dict:

        # Work out where the wav files for this video are stored
        YOUTUBE_VIDEO_ID = self.datalink.split("=")[1]
        FOLDER_NAME = f"./tests/{YOUTUBE_VIDEO_ID}"

        # Get the audio file
        audio_file = DownloadAudio(self.datalink)

        # Get the name of the stored wav file
        original_file_name = audio_file.download(FOLDER_NAME)

        # Get the transcription of the audio
        text_transcriptions = ""
        chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
        for chunk_segment in chunk_segments:
            text_transcriptions += chunk_segment.text.replace("$", "\\$")

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = []
        for i, sentence in enumerate(sentences):
            segment = {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence)
            }
            segments.append(segment)

        final_transcription = {
            "title": audio_file.get_yt_title(),
            "text": text_transcriptions,
            "segments": segments
        }

        return final_transcription


class AudioTranscription:
    """Performs transcription on an MP3 file"""

    def __init__(self, audio_file) -> None:
        self.file = audio_file
        self.title = self.file.name
        self.folder_name = f"./tests/{self.title}".replace(' ', '')
        self.folder_name = self.folder_name[:self.folder_name.rindex('.')]
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    def get_redacted_name(self):
        return self.folder_name

    def transcribe(self) -> dict:
        """Returns the transcription of the MP3 audio as a string"""

        start_time = time.time()
        if not os.path.exists(self.folder_name):
            os.mkdir(self.folder_name)

        if self.title.endswith('wav'):
            audio = pydub.AudioSegment.from_wav(self.file)
            file_type = 'wav'
        elif self.title.endswith('mp3'):
            audio = pydub.AudioSegment.from_mp3(self.file)
            file_type = 'mp3'

        save_path = Path(self.folder_name) / self.file.name
        audio.export(save_path, format=file_type)
        final_wav_path = save_path

        # faster-whisper is fed a wav file, so convert mp3 uploads first
        if file_type == 'mp3':
            sound = AudioSegment.from_mp3(save_path)
            final_wav_path = self.folder_name + "/" + self.title[:-4] + '.wav'
            sound.export(final_wav_path, format="wav")

        chunk_segments, info = self.model.transcribe(final_wav_path, beam_size=5)
        text_transcriptions = ""
        for chunk_segment in chunk_segments:
            text_transcriptions += chunk_segment.text.replace("$", "\\$")

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = []
        for i, sentence in enumerate(sentences):
            segment = {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence)
            }
            segments.append(segment)

        final_transcription = {
            "title": self.title,
            "text": text_transcriptions,
            "segments": segments
        }
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")

        return final_transcription


def convert_pdf_to_txt_pages(path):
    texts = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    size = 0
    c = 0
    file_pages = PDFPage.get_pages(path)
    nbPages = len(list(file_pages))

    for page in PDFPage.get_pages(path):
        interpreter.process_page(page)
        t = retstr.getvalue()
        if c == 0:
            texts.append(t)
        else:
            # retstr accumulates, so slice off what previous pages already contributed
            texts.append(t[size:])
        c = c + 1
        size = len(t)

    device.close()
    retstr.close()
    return texts, nbPages


class PDFTranscription:

    def __init__(self, pdf_file):
        self.file = pdf_file
        self.title = pdf_file.name
        self.folder_name = f"./tests/{self.title}".replace(' ', '')
        self.folder_name = self.folder_name[:self.folder_name.rindex('.')]
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    def get_redacted_name(self):
        return self.folder_name

    def transcribe(self):
        text, nbpages = convert_pdf_to_txt_pages(self.file)
        pdf_transcription = ''.join(text)

        sentences = tokenize.sent_tokenize(pdf_transcription)
        segments = []
        for i, sentence in enumerate(sentences):
            segment = {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence)
            }

            segments.append(segment)

        final_transcription = {
            "title": self.title,
            "text": pdf_transcription,
            "segments": segments
        }
        return final_transcription
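
As a worked example of the chunking arithmetic in DownloadAudio.download (illustration only): for the 16 kHz, 16-bit, mono WAV written above, the chunk length reduces to MAX_FILE_SIZE_BYTES divided by the byte rate:

# Chunk-size arithmetic from DownloadAudio.download, with concrete numbers.
MAX_FILE_SIZE_BYTES = 18000000

sample_rate, bit_rate, channels = 16000, 16, 1           # the WAV written above
bytes_per_sec = (sample_rate * bit_rate * channels) / 8  # 32,000 bytes/s
duration_in_sec = 3600                                   # say, a one-hour talk
wav_file_size = bytes_per_sec * duration_in_sec          # ~115.2 MB

# Same ratio the code computes (it then applies math.ceil):
chunk_length_in_sec = (duration_in_sec * MAX_FILE_SIZE_BYTES) / wav_file_size
print(chunk_length_in_sec)  # 562.5 -> each exported chunk is ~9.4 minutes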