quantaji commited on
Commit
21a2300
1 Parent(s): f1370da

add post-process file

Browse files
Dockerfile CHANGED
@@ -34,6 +34,10 @@ RUN python core/init_sbt.py
34
  # add app
35
  ADD ./app.py /project/app.py
36
  EXPOSE 7860
 
 
 
 
37
 
38
  # add code
39
  ADD ./core/ /project/core/
 
34
  # add app
35
  ADD ./app.py /project/app.py
36
  EXPOSE 7860
37
+ # create log dir for grobid
38
+ RUN mkdir /opt/grobid/logs
39
+ # download en_core_web_sm
40
+ RUN python -m spacy download en_core_web_sm
41
 
42
  # add code
43
  ADD ./core/ /project/core/
app.py CHANGED
@@ -5,6 +5,7 @@ from subprocess import call
5
  from shiny import App, reactive, render, ui
6
 
7
  from core.read_pdf import process_pdf, temp_dir
 
8
 
9
  last_pdf_md5_preprocess_stage = None
10
 
@@ -29,12 +30,9 @@ def ui_card(title, *args):
29
  app_ui = ui.page_fluid(
30
  ui.h1("Document2Slide Demo"),
31
  ui_card(
32
- "Upload PDF",
33
- ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=True),
34
  ui.output_text("upload_file_status", ),
35
- ),
36
- ui_card(
37
- "Preprocess",
38
  ui.p(
39
  ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
40
  ui.output_text("preprocess_result", ),
@@ -42,17 +40,21 @@ app_ui = ui.page_fluid(
42
  ui.output_text("preprocess_status", ),
43
  ui.download_button("download_preprocessed", "Download preprocessed file"),
44
  ),
 
45
  ui_card(
46
- "Download the bullet points in Markdown format.",
47
- ui.download_button("download_bullet_point", "Download bullet point"),
48
- ),
49
- ui_card(
50
- "Download the beamer source code `.tex` of the slide",
51
- ui.download_button("download_beamer", "Download beamer source code"),
52
- ),
53
- ui_card(
54
- "Download the PDF of slide.",
55
- ui.download_button("download_slide", "Download slide generated"),
 
 
 
56
  ),
57
  )
58
 
@@ -129,5 +131,99 @@ def server(input, output, session):
129
  call(args, cwd=temp_dir)
130
  return str(os.path.join(temp_dir, file_name + '.zip'))
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  app = App(app_ui, server)
 
5
  from shiny import App, reactive, render, ui
6
 
7
  from core.read_pdf import process_pdf, temp_dir
8
+ from core.chatgpt.utils import generate_latex_slide
9
 
10
  last_pdf_md5_preprocess_stage = None
11
 
 
30
  app_ui = ui.page_fluid(
31
  ui.h1("Document2Slide Demo"),
32
  ui_card(
33
+ "Upload PDF and Preprocess",
34
+ ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=False),
35
  ui.output_text("upload_file_status", ),
 
 
 
36
  ui.p(
37
  ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
38
  ui.output_text("preprocess_result", ),
 
40
  ui.output_text("preprocess_status", ),
41
  ui.download_button("download_preprocessed", "Download preprocessed file"),
42
  ),
43
+ ui.h3("Due to gpt-4's unreliable service, we choose to show our demo locally. You can refer to ./core/chatgpt/generate_slides.py for this pipeline."),
44
  ui_card(
45
+ "Upload the generated bullet points in pre-defined format.",
46
+ ui.input_file("input_bullet", "Choose a .tex bullet-point file to upload:", multiple=False),
47
+ ui.output_text("upload_bullet_status", ),
48
+ ui.p(
49
+ ui.input_action_button("process_bullet", "Generate .tex", class_="btn-primary"),
50
+ ui.output_text("process_bullet_result", ),
51
+ ),
52
+ ui.p(ui.download_button("download_beamer", "Download beamer source code")),
53
+ ui.p(
54
+ ui.input_action_button("complie_latex", "Compile the latex file generated before.", class_="btn-primary"),
55
+ ui.output_text("complie_latex_result", ),
56
+ ),
57
+ ui.p(ui.download_button("download_slide", "Download slide generated")),
58
  ),
59
  )
60
 
 
131
  call(args, cwd=temp_dir)
132
  return str(os.path.join(temp_dir, file_name + '.zip'))
133
 
134
@output
@render.text
def upload_bullet_status():
    """Report the upload status of the bullet-point .txt file.

    Defined inside server(); bound to the UI's `ui.output_text("upload_bullet_status")`.

    Bug fix: the original was named `upload_file_status` (clashing with the
    PDF-upload status handler) and lacked the `@output`/`@render.text`
    decorators, so the `upload_bullet_status` text in the UI was never rendered.
    """
    file_infos = input.input_bullet()
    # file_infos is a list of dicts like:
    # [{'name': ..., 'size': ..., 'type': ..., 'datapath': ...}]
    if not file_infos:
        return "There is no file provided currently."
    elif file_infos[0]['type'] != 'text/plain':
        return "the file you provide is not in txt format, upload another one!"
    else:
        return "txt file successfully uploaded!"
143
+
144
@output
@render.text
@reactive.event(input.process_bullet)  # Take a dependency on the button
async def process_bullet_result():
    """Convert the uploaded bullet-point .txt into a Beamer .tex in temp_dir.

    Returns a user-facing status string rendered next to the button.

    Fixes vs. original: guards against an empty upload list before indexing,
    narrows the bare `except:` to `except Exception`, de-garbles the error
    message, and uses `os.makedirs(..., exist_ok=True)`.
    """
    file_infos = input.input_bullet()

    if not file_infos or file_infos[0]['type'] != 'text/plain':
        return "No .txt provided, please upload one!"

    file_name = file_infos[0]['name']
    txt_pth = file_infos[0]['datapath']

    try:
        with open(txt_pth, 'r') as f:
            slide = f.read()

        # Keep the [:-4] stem convention — the download/compile handlers
        # derive the .tex/.pdf paths the same way.
        output_tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))
        os.makedirs(temp_dir, exist_ok=True)

        generate_latex_slide(slide, output_tex_pth)

        return "Generate .tex file successful!"
    except Exception:
        # Narrowed from a bare `except:`; message de-garbled.
        return "Something wrong happened, please switch to another file!"
170
+
171
@session.download()
def download_beamer():
    """Serve the generated Beamer .tex file for download, if it exists.

    Returns the path to the .tex derived from the uploaded bullet file's
    name, or None when nothing has been uploaded / generated yet.
    """
    file_infos = input.input_bullet()
    if not file_infos:
        return

    tex_pth = str(os.path.join(temp_dir, file_infos[0]['name'][:-4] + '.tex'))
    return tex_pth if os.path.exists(tex_pth) else None
185
+
186
@output
@render.text
@reactive.event(input.complie_latex)  # Take a dependency on the button
async def complie_latex_result():
    """Compile the previously generated .tex with latexmk/xelatex.

    Returns a user-facing status string.  (The name keeps the original
    "complie" spelling because it is bound to the `input.complie_latex`
    button id in the UI.)

    Fixes vs. original: uses `os.path.basename` instead of the
    non-portable `split('/')[-1]`, and fixes the "sucessful" typo in the
    user-facing message.
    """
    file_infos = input.input_bullet()
    if not file_infos:
        return "No file uploaded yet!"

    file_name = file_infos[0]['name']
    tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))

    if not os.path.exists(tex_pth):
        return "No .tex file yet, please upload a .txt bullet point file and convert it to beamer tex."

    # latexmk is run with cwd=temp_dir, so pass only the file name.
    args = ["latexmk", "-xelatex", os.path.basename(tex_pth)]
    return_code = call(args, cwd=temp_dir)

    return "Compile successful!" if return_code == 0 else "Compile fail!"
211
+
212
@session.download()
def download_slide():
    """Serve the compiled slide PDF for download, if it exists.

    Returns the path to the .pdf derived from the uploaded bullet file's
    name, or None when nothing has been uploaded / compiled yet.
    """
    file_infos = input.input_bullet()
    if not file_infos:
        return

    pdf_pth = str(os.path.join(temp_dir, file_infos[0]['name'][:-4] + '.pdf'))
    return pdf_pth if os.path.exists(pdf_pth) else None
227
+
228
 
229
  app = App(app_ui, server)
core/chatgpt/dialogue_1.txt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [user]
2
+
3
+ You will assist me in creating conference slides from a research paper.
4
+ I will provide you with the slide deck format, title, table of contents, and abstract.
5
+ Afterwards, I will provide you with the first section of the paper.
6
+ In the future, I will provide you with the rest of the sections.
7
+ Ensure the slide deck is clear, informative, and easy to follow.
8
+
9
+ [assistant]
10
+
11
+ I'm happy to help you create conference slides from your research paper.
12
+ To get started, please share the slide deck format with me.
13
+
14
+ [user]
15
+
16
+ The slides you'll create should have the following format:
17
+
18
+ \n\n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
19
+ \n- Use [PE] tag to denote the end of a slide.
20
+
21
+ \n\nFor the slide content:
22
+ \n- The slide content will be formatted as a list of bullet points.
23
+ \n- Avoid using '*' or '-' to list items, instead use special tokens [T] and [T][T] as described below.
24
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
25
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
26
+ \n- You can include multiple [T] and [T][T] points on each page, but try to keep it to a reasonable amount to avoid clutter.
27
+ \n- 3-4 bullet points (lines beginning with [T]) per page is a good rule of thumb!!!
28
+
29
+ \n\nImportant!!!:
30
+ \n- Avoid creating slides that are overloaded with text.
31
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
32
+
33
+ \n\nIn general slides will look as follows:
34
+ \n[PB] Title of Page 1
35
+ \n[T] Point 1
36
+ \n[T][T] Subpoint 1
37
+ \n[T][T] Subpoint 2
38
+ \n[T] Point 2
39
+ \n[T] Point 3
40
+ \n[PE]
41
+
42
+ \n\n[PB] Title of Page 2
43
+ \n[T] Point 4
44
+ \n[T][T] Subpoint 3
45
+ \n[T] Point 5
46
+ \n[T] Point 6
47
+ \n[T][T] Subpoint 4
48
+ \n[PE]
49
+
50
+ \n\n... # more pages to follow
51
+
52
+ [assistant]
53
+
54
+ Great! Please provide me with the title, table of contents, and abstract of your research paper.
55
+ Once I have that information, we can proceed with creating the slides together.
56
+
57
+ [user]
58
+
59
+ [data_tag_0]
60
+
61
+ [assistant]
62
+
63
+ Now that I have the title, abstract, and table of contents, please provide the first section of the paper,
64
+ [data_tag_1].
65
+ I'll create a clear and informative slide deck following the specified format.
66
+
67
+ [user]
68
+
69
+ Please adhere to the slide deck format and avoid overcrowding slides with excessive text.
70
+ Consider creating additional slides if necessary.
71
+ \nHere is the first section of the paper:
72
+ \n\n[data_tag_2]
core/chatgpt/dialogue_2.txt ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [user]
2
+
3
+ You are assisting me in creating conference slides from a research paper.
4
+ I will provide you with the slide deck format, and the current section of the paper.
5
+ Then, you will create informative slides from it.
6
+
7
+ [assistant]
8
+
9
+ Understood! Please provide me with the slide deck format to follow.
10
+
11
+ [user]
12
+
13
+ The slides you'll create should have the following format:
14
+
15
+ \n\n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
16
+ \n- Use [PE] tag to denote the end of a slide.
17
+
18
+ \n\nFor the slide content:
19
+ \n- The slide content will be formatted as a list of bullet points.
20
+ \n- Avoid using '*' or '-' for to list items, instead use special tokens [T] and [T][T] as described below.
21
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
22
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
23
+ \n- Do not include [T][T][T] or more levels of indentation.
24
+
25
+ \n\nImportant!!!:
26
+ \n- Avoid creating slides that are overloaded with text.
27
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
28
+
29
+ \n\nIn general slides will look as follows:
30
+ \n[PB] Title of Page 1
31
+ \n[T] Point 1
32
+ \n[T][T] Subpoint 1
33
+ \n[T][T] Subpoint 2
34
+ \n[T] Point 2
35
+ \n[T] Point 3
36
+ \n[PE]
37
+
38
+ \n\n[PB] Title of Page 2
39
+ \n[T] Point 4
40
+ \n[T][T] Subpoint 3
41
+ \n[T] Point 5
42
+ \n[T] Point 6
43
+ \n[T][T] Subpoint 4
44
+ \n[PE]
45
+
46
+ \n\n... # more pages to follow
47
+
48
+
49
+ [assistant]
50
+
51
+ Great! Please provide me with the current section of the research paper,
52
+ I'll create a clear and informative slide deck following the specified format.
53
+
54
+ [user]
55
+
56
+ Please adhere to the slide deck format and avoid overcrowding slides with excessive text. Consider creating multiple slides if necessary. \nHere is the current section of the paper:
57
+
58
+ \n\n[data_tag_0]
core/chatgpt/dialogue_3.txt ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # there are 4 tags possible
2
+ # user, assistant, system and data_tag_i
3
+
4
+ [user]
5
+
6
+ You are assisting me in creating conference slides from a research paper.
7
+ I will provide you with the slide deck format, and the current draft of the slides.
8
+ Your task is to create clear, concise and informative slides from the draft.
9
+ Current draft contains redundant information, and you should remove it.
10
+ Slides must be interesting, engaging, and informative.
11
+ Modify the slides as you see fit, and make sure that they follow rules and formats described below.
12
+ \n\nRules:
13
+ \n- Slides may be too long, and you should split them into multiple slides.
14
+ \n- Slides may not be coherent, and you should rephrase them to make them more coherent and informative.
15
+ \n- Slides should have a flow, and you should make sure that the flow is preserved.
16
+ \n- There should not be slides with very little text.
17
+ \n- Make slides clear and concise, and avoid overcrowding them with excessive text.
18
+
19
+ [assistant]
20
+
21
+ Understood! Please provide me with the slide deck format to follow.
22
+
23
+ [user]
24
+
25
+ The slides you'll create should have the following format:
26
+
27
+ \n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
28
+ \n- Use [PE] tag to denote the end of a slide.
29
+
30
+ \n\nFor the slide content:
31
+ \n- The slide content will be formatted as a list of bullet points.
32
+ \n- Avoid using '*' or '-' to list items, instead use special tokens [T] and [T][T] as described below.
33
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
34
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
35
+ \n- Do not include [T][T][T] or more levels of indentation.
36
+
37
+ \n\nImportant!!!:
38
+ \n- Avoid creating slides that are overloaded with text.
39
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
40
+
41
+ \n\nIn general slides will look as follows:
42
+ \n[PB] Title of Page 1
43
+ \n[T] Point 1
44
+ \n[T][T] Subpoint 1
45
+ \n[T][T] Subpoint 2
46
+ \n[T] Point 2
47
+ \n[T] Point 3
48
+ \n[PE]
49
+
50
+ \n\n[PB] Title of Page 2
51
+ \n[T] Point 4
52
+ \n[T][T] Subpoint 3
53
+ \n[T] Point 5
54
+ \n[T] Point 6
55
+ \n[T][T] Subpoint 4
56
+ \n[PE]
57
+
58
+ \n\n... # more pages to follow
59
+
60
+ [assistant]
61
+ Great! Please provide me with the current draft of the slides.
62
+
63
+
64
+ [user]
65
+ Here is the current draft of the slides:
66
+ \n\n[data_tag_0]
core/chatgpt/dialogue_4.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # the following dialogue is for GPT-4 model
2
+ # there are 4 tags possible
3
+ # user, assistant, system and data_tag_i
4
+
5
+ [user]
6
+
7
+ You are assisting me in creating conference slides from a research paper.
8
+ I will provide you with the slide deck format, and the current draft of the slides.
9
+ Your task is to create clear, informative slides from the draft.
10
+ Slides must be interesting, engaging, and informative.
11
+ Modify the slides as you see fit, and make sure that they follow rules and formats described below.
12
+ \n\nRules:
13
+ \n- Slides may be too long, and you should split them into multiple slides.
14
+ \n- Slides may not be coherent, and you should rephrase them to make them more coherent and informative.
15
+ \n- Slides should have a flow, and you should make sure that the flow is preserved.
16
+ \n- There should not be slides with very little text.
17
+ \n- Make slides clear and concise, and avoid overcrowding them with excessive text.
18
+
19
+ [assistant]
20
+
21
+ Understood! Please provide me with the slide deck format to follow.
22
+
23
+ [user]
24
+
25
+ The slides you'll create should have the following format:
26
+
27
+ \n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
28
+ \n- Use [PE] tag to denote the end of a slide.
29
+
30
+ \n\nFor the slide content:
31
+ \n- The slide content will be formatted as a list of bullet points.
32
+ \n- Avoid using '*' or '-' for to list items, instead use special tokens [T] and [T][T] as described below.
33
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
34
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
35
+ \n- Do not include [T][T][T] or more levels of indentation.
36
+
37
+ \n\nImportant!!!:
38
+ \n- Avoid creating slides that are overloaded with text.
39
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
40
+
41
+ \n\nIn general slides will look as follows:
42
+ \n[PB] Title of Page 1
43
+ \n[T] Point 1
44
+ \n[T][T] Subpoint 1
45
+ \n[T][T] Subpoint 2
46
+ \n[T] Point 2
47
+ \n[T] Point 3
48
+ \n[PE]
49
+ /.. /
50
+ /.'
51
+ .'
52
+ .'
53
+
54
+ \n\n[PB] Title of Page 2
55
+ \n[T] Point 4
56
+ \n[T][T] Subpoint 3
57
+ \n[T] Point 5
58
+ \n[T] Point 6
59
+ \n[T][T] Subpoint 4
60
+ \n[PE]
61
+
62
+ \n\n... # more pages to follow
63
+
64
+ [assistant]
65
+ Great! Please provide me with the current draft of the slides.
66
+
67
+
68
+ [user]
69
+ Here is the current draft of the slides:
70
+ \n\n[data_tag_0]
core/chatgpt/generate_slides.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ import numpy as np
6
+ import openai
7
+ import pandas as pd
8
+ import spacy
9
+ import tqdm
10
+ from tqdm import tqdm
11
+
12
+ from .utils import get_num_tokens, parse_prompt, num_tokens_from_messages, clean_slides, slide_generation_ver2, generate_latex_slide
13
+
14
+ nlp = spacy.load('en_core_web_sm')
15
+
16
+
17
def set_openai_api_key(key: str):
    """Configure the module-level OpenAI client with the caller's API key.

    Bug fix: the original assigned the literal string 'key' instead of the
    `key` parameter, so the supplied API key was silently ignored and every
    subsequent API call would fail authentication.
    """
    openai.api_key = key
19
+
20
+
21
def generate_slide(json_pth: str):
    """End-to-end slide generation from a preprocessed paper JSON.

    Pipeline (all chat calls go through the OpenAI API):
      1. Load the paper JSON (title / abstract / sectioned text / figures).
      2. Split over-long sections into <=1400-token pieces at sentence
         boundaries (spaCy sentencizer), labelling continuations "(cont.)".
      3. dialogue_1: draft slides for the first section, seeded with the
         title, table of contents, and abstract.
      4. dialogue_2: draft slides for every remaining section.
      5. Iteratively merge drafts at [PE] page boundaries and refine each
         merged chunk via dialogue_3, until the chunk count stabilises or
         the total token count drops to <=4000.
      6. dialogue_4: one final refinement pass, using GPT-4 when the
         account has access, else GPT-3.5.

    Returns the final slide text in the [PB]/[T]/[T][T]/[PE] tag format.

    NOTE(review): the dialogue_*.txt prompt files are opened with paths
    relative to the current working directory — confirm callers run from
    core/chatgpt/.  This body was reconstructed from a diff rendering that
    lost indentation; confirm the dialogue_3 refinement pass is intended to
    run *inside* the merge loop.
    """

    # Model ids; gpt-4 is only used at the final step and only if available.
    model_list = [model['id'] for model in openai.Model.list()['data']]
    gpt4_id = "gpt-4-0314"
    gpt3_id = 'gpt-3.5-turbo-0301'

    with open(json_pth) as f:
        data = json.load(f)

    title = data['title']
    abstract = data['abstract']
    paper_length = len(data['text'])
    # Each section: [section title, concatenated sentence strings].
    sections = [[head['section'], ' '.join([data['text'][idx]['string'] for idx in range(head['start'], min(head['end'] + 1, paper_length))])] for head in data['headers']]
    # NOTE(review): `figures` is computed but never used below.
    figures = [fig['caption'] for fig in data['figures']]

    ### ! Split the sections by chunks with token_limit
    new_sections = []
    toc = ""  # table of contents, built as "Title1; Title2; ..."
    token_limit = 1400

    for section in sections:
        section_title = section[0]
        curr_count = get_num_tokens(section[1])

        toc += section_title + "; "

        if curr_count > token_limit:
            # split the section into sentences
            sents = nlp(section[1]).sents

            # Greedily pack sentences into pieces below token_limit.
            temp_list = []
            for sent in sents:
                if not temp_list:
                    temp_list.append(sent.text)
                    continue
                curr_count = get_num_tokens(temp_list[-1])
                if curr_count + get_num_tokens(sent.text) < token_limit:
                    temp_list[-1] += sent.text
                else:
                    temp_list.append(sent.text)

            for i in range(len(temp_list)):
                if i == 0:
                    new_sections.append([section_title, temp_list[i]])
                else:
                    new_sections.append([section_title + " (cont.)", temp_list[i]])
        else:
            new_sections.append(section)

    print(f"Total number of sections: {len(new_sections)}")

    # ! get the initial message
    initial_user_message = "Title: " + title + "\nTable of Contents: " + toc + "\nAbstract: " + abstract
    initial_section_title = new_sections[0][0]
    initial_section_content = new_sections[0][1]

    # ! initial dialogue, Generates slides for the first section of the research paper.
    res = []
    # NOTE(review): `data` is reused here, shadowing the loaded paper JSON.
    data = [initial_user_message, initial_section_title, initial_section_content]
    messages = parse_prompt("./dialogue_1.txt", data)
    token_length = num_tokens_from_messages(messages)

    assert token_length < 2400, f"Message is too long: {token_length}"

    response = openai.ChatCompletion.create(
        model=gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    answer = response["choices"][0]["message"]["content"]
    res.append(answer)
    time.sleep(10)

    ### ! Following dialogue. Generates slides for the following sections of the research paper.
    for i, (section_title, section_content) in enumerate(new_sections[1:]):
        print(f"Section {i+1}: {section_title} is being processed...")

        data = [section_content]
        messages = parse_prompt("./dialogue_2.txt", data)

        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"

        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        answer = response["choices"][0]["message"]["content"]
        res.append(answer)

        del messages, token_length, response, answer
        time.sleep(10)  # sleep for 10 seconds to avoid API limit

    ### ! Clean slides from comments, empty lines and other garbage
    for i in range(len(res)):
        res[i] = clean_slides(res[i])

    temp_res = res
    prev_cnt = len(temp_res)

    # Merge-and-refine until the chunk count stops shrinking or the deck
    # is small enough for a single final pass.
    while len(temp_res) > 1:
        temp_num_tokens = get_num_tokens("\n".join(temp_res))
        temp_res = slide_generation_ver2(temp_res, 1800)
        print(f"The length of res is {len(temp_res)}, and the number of tokens is {temp_num_tokens}")

        # if the number of slides is not changed then break
        if len(temp_res) == prev_cnt:
            break
        else:
            prev_cnt = len(temp_res)

        # if the number of tokens is less than 4000 then break
        if temp_num_tokens <= 4000:
            break

        # Refine each merged chunk through dialogue_3.
        new_res = []
        for i in tqdm(range(len(temp_res))):
            data = [temp_res[i]]
            messages = parse_prompt("./dialogue_3.txt", data)

            token_length = num_tokens_from_messages(messages)
            assert token_length < 2400, f"Message is too long: {token_length}"

            response = openai.ChatCompletion.create(
                model=gpt3_id,
                messages=messages,
                temperature=0.9,
            )

            temp = response["choices"][0]["message"]["content"]
            temp = clean_slides(temp)

            new_res.append(temp)
            time.sleep(5)  # needed to avoid API limit

        temp_res = new_res

        time.sleep(10)  # needed to avoid API limit

    # ! final refinement
    final_draft = "\n".join(temp_res)
    data = [final_draft]
    messages = parse_prompt("./dialogue_4.txt", data)

    print(num_tokens_from_messages(messages))

    response = openai.ChatCompletion.create(
        model=gpt4_id if gpt4_id in model_list else gpt3_id,
        messages=messages,
        temperature=0.5,
    )

    temp = response["choices"][0]["message"]["content"]

    # generate_latex_slide(temp, "test.tex")

    return temp
core/chatgpt/utils.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+
3
+ from typing import Dict, Tuple, List
4
+
5
+
6
def slide_generation(res, num_tokens_limit=1800):
    """Greedily merge consecutive slide-text chunks under a token budget.

    The first chunk seeds the accumulator; each following non-empty chunk is
    appended to the current merged chunk while the combined token count stays
    below `num_tokens_limit`, otherwise it starts a new merged chunk.

    Note: assumes `res` is non-empty (res[0] seeds the result).
    """
    merged = [res[0]]
    for chunk in res[1:]:
        if not chunk:
            continue
        combined = get_num_tokens(merged[-1]) + get_num_tokens(chunk)
        if combined < num_tokens_limit:
            merged[-1] += chunk
        else:
            merged.append(chunk)
    return merged
19
+
20
+
21
def slide_generation_ver2(res, num_tokens_limit=1800):
    """Re-chunk slides at [PE] page boundaries, then re-merge under the limit.

    Joins all chunks, splits on the [PE] end-of-page tag, re-appends the tag
    to each non-empty page, and delegates the greedy merging to
    `slide_generation`.
    """
    pages = "\n".join(res).split("[PE]")
    normalized = [f"{page.strip()}\n[PE]\n" if page else "" for page in pages]
    return slide_generation(normalized, num_tokens_limit=num_tokens_limit)
26
+
27
+
28
def parse_prompt(file: str, data: List[str] = None):
    """Parse a dialogue template file into an OpenAI chat message list.

    The template uses [user] / [assistant] / [system] header lines to start a
    new message; subsequent non-empty lines are joined (space-separated) into
    that message's content.  Literal "\\n" sequences become real newlines,
    and [data_tag_i] placeholders are substituted, in order, from `data`.

    Returns a list of {"role": ..., "content": ...} dicts.

    Fixes vs. original: the file handle is now closed (with-statement), and
    `data=None` no longer crashes the final sanity assert.
    """
    if data is None:
        data = []

    roles = []
    contents = []

    with open(file, "r") as fh:
        for line in fh:
            # NOTE(review): this skips any line *containing* '#', not only
            # comment lines — kept as-is, but template content must
            # therefore avoid '#'.
            if "#" in line or not line.strip():
                continue
            if "[user]" in line:
                roles.append("user")
                contents.append([])
            elif "[assistant]" in line:
                roles.append("assistant")
                contents.append([])
            elif "[system]" in line:
                roles.append("system")
                contents.append([])
            else:
                assert roles, "No role specified"
                contents[-1].append(line.strip())

    # Sanity-check role ordering.
    assert roles[0] in ("user", "system"), "First role must be user or system"
    for i in range(1, len(roles)):
        assert roles[i] in ("user", "assistant"), "Roles must be user or assistant"
        assert roles[i] != roles[i - 1], "Roles must alternate between user and assistant"

    contents_str = [" ".join(chunks) for chunks in contents]

    # Substitute data tags in document order.
    curr_idx = 0
    for i in range(len(contents_str)):
        tag = f"[data_tag_{curr_idx}]"
        # replace literal \n with a real newline
        contents_str[i] = contents_str[i].replace("\\n", "\n")
        if tag in contents_str[i]:
            contents_str[i] = contents_str[i].replace(tag, data[curr_idx])
            curr_idx += 1
    assert curr_idx == len(data), "Not all data tags were replaced"

    return [{"role": role, "content": content} for role, content in zip(roles, contents_str)]
78
+
79
+
80
def clean_slides(slide):
    """Strip everything except recognised slide-tag lines.

    Keeps only lines beginning with one of the slide tokens ([F], [T],
    [PB], [PE]) — model chatter, blank lines and other garbage are dropped.
    ([T][T] sub-bullets are covered by the [T] prefix.)
    """
    kept = [line for line in slide.split('\n')
            if line.startswith(('[F]', '[T]', '[PB]', '[PE]'))]
    return '\n'.join(kept)
87
+
88
+
89
def generate_latex_slide(slide, output_path=None):
    """Render tagged slide text ([PB]/[PE]/[T]/[T][T]) as a Beamer document.

    Each [PB]-delimited page becomes one frame: the first line is the frame
    title, and [T] / [T][T] lines become first- / second-level itemize
    entries.  LaTeX-special characters (_ & ^ $) are escaped at the end.

    If `output_path` is given the document is written there; the LaTeX
    source is also returned (new, backward-compatible — the original
    returned None).

    Fixes vs. original: a page with a title but no bullets no longer raises
    TypeError; an empty page mid-deck no longer aborts all remaining pages;
    a page without a newline after its title no longer raises ValueError;
    the per-page debug print was removed.
    """
    latex_code = "\\documentclass{beamer} \n\\begin{document}"

    # Everything before the first [PB] is ignored.
    pages = slide.split('[PB]')[1:]

    for i, page in enumerate(pages):
        page = page.strip()

        # First line is the title; the rest (up to [PE]) is bullet content.
        head, _, rest = page.partition("\n")
        title = head.strip()
        if "[PE]" in title:
            # Single-line page: strip the end tag from the title itself.
            title = title.split("[PE]")[0].strip()
            rest = ""
        content = rest.split("[PE]")[0].strip() if "[PE]" in rest else rest.strip()

        # Parse bullet lines into [point, subpoint, ...] groups.
        groups = []
        for item in content.split('\n'):
            if not item:
                break  # stop at the first blank line (original behavior)
            if '[T][T]' in item:
                assert groups, "Subpoint cannot be the first item in a page"
                groups[-1].append(item.split('[T][T]')[1])
            else:
                text = item.split('[T]')[1] if '[T]' in item else item
                groups.append([text])

        frame = ""
        if title:
            frame += f"\n\\begin{{frame}}{{{title}}}\n\n"
        if groups:
            frame += "\\begin{itemize}\n"
            for group in groups:
                frame += f"\\item {group[0]}\n"
                if len(group) > 1:
                    frame += "\\begin{itemize}\n"
                    for sub in group[1:]:
                        frame += f"\\item {sub}\n"
                    frame += "\\end{itemize}\n"
            frame += "\\end{itemize}\n"

        if not frame:
            # Empty page: skip it rather than aborting the whole deck.
            if i == len(pages) - 1:
                latex_code += "\n\\end{document}"
            continue

        frame += "\n\\end{frame}\n"
        if i == len(pages) - 1:
            # Last page also closes the document.
            frame += "\n\\end{document}"
        latex_code += frame

    # Escape LaTeX specials; the generated markup itself contains none.
    latex_code = latex_code.replace('_', r'\_').replace('&', r'\&').replace('^', r'\^').replace('$', r'\$')

    if output_path:
        with open(output_path, 'w') as f:
            f.write(latex_code)
    return latex_code
166
+
167
+
168
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """
    Count the tokens needed to encode the given chat messages.

    Mirrors the accounting described in:
    https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions#managing-conversations
    """
    encoding = tiktoken.encoding_for_model(model)
    total = 0
    for message in messages:
        # every message follows <im_start>{role/name}\n{content}<im_end>\n
        total += 4
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                # if there's a name, the (always-1-token) role is omitted
                total -= 1
    # every reply is primed with <im_start>assistant
    return total + 2
184
+
185
+
186
def get_num_tokens(message, model="gpt-3.5-turbo-0301"):
    """Token count of a single string under the given model's encoding."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(message))
190
+ return num_tokens
191
+
192
+
193
def get_paper_text_in_chunks(example, chunk_size=4000):
    """Concatenate tagged title/abstract/sections/figures into token-bounded chunks.

    Title ([TB]/[TE]), abstract ([AB]/[AE]), sections ([SB]/[SC]/[SE]) and
    figure captions ([FB]/[FE]) are wrapped in their tag pairs, then packed
    greedily into chunks whose token count stays within `chunk_size`.

    Raises AssertionError if any single section/figure exceeds `chunk_size`.
    """
    paper = example['paper']
    n_text = len(paper['text'])

    title = '[TB] ' + example['title'] + ' [TE] '
    abstract = '[AB] ' + paper['abstract'] + ' [AE] '

    sections = []
    for head in paper['headers']:
        body = ' '.join(paper['text'][idx]['string']
                        for idx in range(head['start'], min(head['end'] + 1, n_text)))
        sections.append(' [SB] ' + head['n'] + ' ' + head['section'] + ' [SC] ' + body + ' [SE] ')
    figures = [' [FB] ' + fig['caption'] + ' [FE] ' for fig in paper['figures']]

    chunks = []
    current = title + abstract
    current_len = get_num_tokens(current)

    for piece in sections + figures:
        piece_len = get_num_tokens(piece)
        assert piece_len < chunk_size, "Section or figure is too long to fit in a chunk"
        if current_len + piece_len > chunk_size:
            chunks.append(current)
            current = piece
            current_len = piece_len
        else:
            current += piece
            current_len += piece_len

    if current_len > 0:
        chunks.append(current)

    return chunks
core/tei.py CHANGED
@@ -99,13 +99,15 @@ class TEIFile(object):
99
  if head.parent.name == 'div':
100
  txt = head.parent.get_text(separator=' ', strip=True)
101
  # the following is only valid for arabic numerals...
102
- if head.get("n"):
103
- sections.append([head.text, head.get('n'), txt])
104
- else:
105
- if len(sections) == 0:
106
- print("Grobid processing error.")
107
- sections[-1][2] += txt
108
- # sections.append([head.text, 'invalid n', txt])
 
 
109
  start = 0
110
  for i in sections:
111
  sent = nltk.tokenize.sent_tokenize(i[2])
 
99
  if head.parent.name == 'div':
100
  txt = head.parent.get_text(separator=' ', strip=True)
101
  # the following is only valid for arabic numerals...
102
+ try:
103
+ if head.get("n"):
104
+ sections.append([head.text, head.get('n'), txt])
105
+ else:
106
+ if len(sections) == 0:
107
+ print("Grobid processing error.")
108
+ sections[-1][2] += txt
109
+ except:
110
+ sections.append([head.text, 'invalid n', txt])
111
  start = 0
112
  for i in sections:
113
  sent = nltk.tokenize.sent_tokenize(i[2])
requirements.txt CHANGED
@@ -7,6 +7,7 @@ lxml
7
  # for interaction with openai
8
  openai
9
  tiktoken
 
10
  # for shiny
11
  anyio==3.6.2
12
  appdirs==1.4.4
 
7
  # for interaction with openai
8
  openai
9
  tiktoken
10
+ spacy
11
  # for shiny
12
  anyio==3.6.2
13
  appdirs==1.4.4
shiny_example_dockerfile DELETED
@@ -1,13 +0,0 @@
1
- FROM python:3.9
2
-
3
- WORKDIR /code
4
-
5
- COPY ./requirements.txt /code/requirements.txt
6
-
7
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
-
9
- COPY . .
10
-
11
- EXPOSE 7860
12
-
13
- CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]