quantaji commited on
Commit
21a2300
1 Parent(s): f1370da

add post-process file

Browse files
Dockerfile CHANGED
@@ -34,6 +34,10 @@ RUN python core/init_sbt.py
34
  # add app
35
  ADD ./app.py /project/app.py
36
  EXPOSE 7860
 
 
 
 
37
 
38
  # add code
39
  ADD ./core/ /project/core/
 
34
  # add app
35
  ADD ./app.py /project/app.py
36
  EXPOSE 7860
37
+ # create log dir for grobid
38
+ RUN mkdir /opt/grobid/logs
39
+ # download en_core_web_sm
40
+ RUN python -m spacy download en_core_web_sm
41
 
42
  # add code
43
  ADD ./core/ /project/core/
app.py CHANGED
@@ -5,6 +5,7 @@ from subprocess import call
5
  from shiny import App, reactive, render, ui
6
 
7
  from core.read_pdf import process_pdf, temp_dir
 
8
 
9
  last_pdf_md5_preprocess_stage = None
10
 
@@ -29,12 +30,9 @@ def ui_card(title, *args):
29
  app_ui = ui.page_fluid(
30
  ui.h1("Document2Slide Demo"),
31
  ui_card(
32
- "Upload PDF",
33
- ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=True),
34
  ui.output_text("upload_file_status", ),
35
- ),
36
- ui_card(
37
- "Preprocess",
38
  ui.p(
39
  ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
40
  ui.output_text("preprocess_result", ),
@@ -42,17 +40,21 @@ app_ui = ui.page_fluid(
42
  ui.output_text("preprocess_status", ),
43
  ui.download_button("download_preprocessed", "Download preprocessed file"),
44
  ),
 
45
  ui_card(
46
- "Download the bullet points in Markdown format.",
47
- ui.download_button("download_bullet_point", "Download bullet point"),
48
- ),
49
- ui_card(
50
- "Download the beamer source code `.tex` of the slide",
51
- ui.download_button("download_beamer", "Download beamer source code"),
52
- ),
53
- ui_card(
54
- "Download the PDF of slide.",
55
- ui.download_button("download_slide", "Download slide generated"),
 
 
 
56
  ),
57
  )
58
 
@@ -129,5 +131,99 @@ def server(input, output, session):
129
  call(args, cwd=temp_dir)
130
  return str(os.path.join(temp_dir, file_name + '.zip'))
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  app = App(app_ui, server)
 
5
  from shiny import App, reactive, render, ui
6
 
7
  from core.read_pdf import process_pdf, temp_dir
8
+ from core.chatgpt.utils import generate_latex_slide
9
 
10
  last_pdf_md5_preprocess_stage = None
11
 
 
30
  app_ui = ui.page_fluid(
31
  ui.h1("Document2Slide Demo"),
32
  ui_card(
33
+ "Upload PDF and Preprocess",
34
+ ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=False),
35
  ui.output_text("upload_file_status", ),
 
 
 
36
  ui.p(
37
  ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
38
  ui.output_text("preprocess_result", ),
 
40
  ui.output_text("preprocess_status", ),
41
  ui.download_button("download_preprocessed", "Download preprocessed file"),
42
  ),
43
+ ui.h3("Due to gpt-4's unreliable service, we choose to show our demo locally. You can refer to ./core/chatgpt/generate_slides.py for this pipeline."),
44
  ui_card(
45
+ "Upload the generated bullet points in pre-defined format.",
46
+ ui.input_file("input_bullet", "Choose a .tex bullet-point file to upload:", multiple=False),
47
+ ui.output_text("upload_bullet_status", ),
48
+ ui.p(
49
+ ui.input_action_button("process_bullet", "Generate .tex", class_="btn-primary"),
50
+ ui.output_text("process_bullet_result", ),
51
+ ),
52
+ ui.p(ui.download_button("download_beamer", "Download beamer source code")),
53
+ ui.p(
54
+ ui.input_action_button("complie_latex", "Compile the latex file generated before.", class_="btn-primary"),
55
+ ui.output_text("complie_latex_result", ),
56
+ ),
57
+ ui.p(ui.download_button("download_slide", "Download slide generated")),
58
  ),
59
  )
60
 
 
131
  call(args, cwd=temp_dir)
132
  return str(os.path.join(temp_dir, file_name + '.zip'))
133
 
134
@output
@render.text
def upload_bullet_status():
    """Report the upload status of the bullet-point .txt file.

    Defined inside server(); bound to the UI's `ui.output_text("upload_bullet_status")`.

    Bug fix: the original was named `upload_file_status` (clashing with the
    PDF-upload status handler) and lacked the `@output`/`@render.text`
    decorators, so the `upload_bullet_status` text in the UI was never rendered.
    """
    file_infos = input.input_bullet()
    # file_infos is a list of dicts like:
    # [{'name': ..., 'size': ..., 'type': ..., 'datapath': ...}]
    if not file_infos:
        return "There is no file provided currently."
    elif file_infos[0]['type'] != 'text/plain':
        return "the file you provide is not in txt format, upload another one!"
    else:
        return "txt file successfully uploaded!"
143
+
144
@output
@render.text
@reactive.event(input.process_bullet)  # Take a dependency on the button
async def process_bullet_result():
    """Convert the uploaded bullet-point .txt into a Beamer .tex in temp_dir.

    Returns a user-facing status string rendered next to the button.

    Fixes vs. original: guards against an empty upload list before indexing,
    narrows the bare `except:` to `except Exception`, de-garbles the error
    message, and uses `os.makedirs(..., exist_ok=True)`.
    """
    file_infos = input.input_bullet()

    if not file_infos or file_infos[0]['type'] != 'text/plain':
        return "No .txt provided, please upload one!"

    file_name = file_infos[0]['name']
    txt_pth = file_infos[0]['datapath']

    try:
        with open(txt_pth, 'r') as f:
            slide = f.read()

        # Keep the [:-4] stem convention — the download/compile handlers
        # derive the .tex/.pdf paths the same way.
        output_tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))
        os.makedirs(temp_dir, exist_ok=True)

        generate_latex_slide(slide, output_tex_pth)

        return "Generate .tex file successful!"
    except Exception:
        # Narrowed from a bare `except:`; message de-garbled.
        return "Something wrong happened, please switch to another file!"
170
+
171
@session.download()
def download_beamer():
    """Serve the generated Beamer .tex file for download, if it exists.

    Returns the path to the .tex derived from the uploaded bullet file's
    name, or None when nothing has been uploaded / generated yet.
    """
    file_infos = input.input_bullet()
    if not file_infos:
        return

    tex_pth = str(os.path.join(temp_dir, file_infos[0]['name'][:-4] + '.tex'))
    return tex_pth if os.path.exists(tex_pth) else None
185
+
186
@output
@render.text
@reactive.event(input.complie_latex)  # Take a dependency on the button
async def complie_latex_result():
    """Compile the previously generated .tex with latexmk/xelatex.

    Returns a user-facing status string.  (The name keeps the original
    "complie" spelling because it is bound to the `input.complie_latex`
    button id in the UI.)

    Fixes vs. original: uses `os.path.basename` instead of the
    non-portable `split('/')[-1]`, and fixes the "sucessful" typo in the
    user-facing message.
    """
    file_infos = input.input_bullet()
    if not file_infos:
        return "No file uploaded yet!"

    file_name = file_infos[0]['name']
    tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))

    if not os.path.exists(tex_pth):
        return "No .tex file yet, please upload a .txt bullet point file and convert it to beamer tex."

    # latexmk is run with cwd=temp_dir, so pass only the file name.
    args = ["latexmk", "-xelatex", os.path.basename(tex_pth)]
    return_code = call(args, cwd=temp_dir)

    return "Compile successful!" if return_code == 0 else "Compile fail!"
211
+
212
@session.download()
def download_slide():
    """Serve the compiled slide PDF for download, if it exists.

    Returns the path to the .pdf derived from the uploaded bullet file's
    name, or None when nothing has been uploaded / compiled yet.
    """
    file_infos = input.input_bullet()
    if not file_infos:
        return

    pdf_pth = str(os.path.join(temp_dir, file_infos[0]['name'][:-4] + '.pdf'))
    return pdf_pth if os.path.exists(pdf_pth) else None
227
+
228
 
229
  app = App(app_ui, server)
core/chatgpt/dialogue_1.txt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [user]
2
+
3
+ You will assist me in creating conference slides from a research paper.
4
+ I will provide you with the slide deck format, title, table of contents, and abstract.
5
+ Afterwards, I will provide you with the first section of the paper.
6
+ In the future, I will provide you with the rest of the sections.
7
+ Ensure the slide deck is clear, informative, and easy to follow.
8
+
9
+ [assistant]
10
+
11
+ I'm happy to help you create conference slides from your research paper.
12
+ To get started, please share the slide deck format with me.
13
+
14
+ [user]
15
+
16
+ The slides you'll create should have the following format:
17
+
18
+ \n\n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
19
+ \n- Use [PE] tag to denote the end of a slide.
20
+
21
+ \n\nFor the slide content:
22
+ \n- The slide content will be formatted as a list of bullet points.
23
+ \n- Avoid using '*' or '-' to list items, instead use special tokens [T] and [T][T] as described below.
24
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
25
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
26
+ \n- You can include multiple [T] and [T][T] points on each page, but try to keep it to a reasonable amount to avoid clutter.
27
+ \n- 3-4 bullet points (lines beginning with [T]) per page is a good rule of thumb!!!
28
+
29
+ \n\nImportant!!!:
30
+ \n- Avoid creating slides that are overloaded with text.
31
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
32
+
33
+ \n\nIn general slides will look as follows:
34
+ \n[PB] Title of Page 1
35
+ \n[T] Point 1
36
+ \n[T][T] Subpoint 1
37
+ \n[T][T] Subpoint 2
38
+ \n[T] Point 2
39
+ \n[T] Point 3
40
+ \n[PE]
41
+
42
+ \n\n[PB] Title of Page 2
43
+ \n[T] Point 4
44
+ \n[T][T] Subpoint 3
45
+ \n[T] Point 5
46
+ \n[T] Point 6
47
+ \n[T][T] Subpoint 4
48
+ \n[PE]
49
+
50
+ \n\n... # more pages to follow
51
+
52
+ [assistant]
53
+
54
+ Great! Please provide me with the title, table of contents, and abstract of your research paper.
55
+ Once I have that information, we can proceed with creating the slides together.
56
+
57
+ [user]
58
+
59
+ [data_tag_0]
60
+
61
+ [assistant]
62
+
63
+ Now that I have the title, abstract, and table of contents, please provide the first section of the paper,
64
+ [data_tag_1].
65
+ I'll create a clear and informative slide deck following the specified format.
66
+
67
+ [user]
68
+
69
+ Please adhere to the slide deck format and avoid overcrowding slides with excessive text.
70
+ Consider creating additional slides if necessary.
71
+ \nHere is the first section of the paper:
72
+ \n\n[data_tag_2]
core/chatgpt/dialogue_2.txt ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [user]
2
+
3
+ You are assisting me in creating conference slides from a research paper.
4
+ I will provide you with the slide deck format, and the current section of the paper.
5
+ Then, you will create informative slides from it.
6
+
7
+ [assistant]
8
+
9
+ Understood! Please provide me with the slide deck format to follow.
10
+
11
+ [user]
12
+
13
+ The slides you'll create should have the following format:
14
+
15
+ \n\n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
16
+ \n- Use [PE] tag to denote the end of a slide.
17
+
18
+ \n\nFor the slide content:
19
+ \n- The slide content will be formatted as a list of bullet points.
20
+ \n- Avoid using '*' or '-' for to list items, instead use special tokens [T] and [T][T] as described below.
21
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
22
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
23
+ \n- Do not include [T][T][T] or more levels of indentation.
24
+
25
+ \n\nImportant!!!:
26
+ \n- Avoid creating slides that are overloaded with text.
27
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
28
+
29
+ \n\nIn general slides will look as follows:
30
+ \n[PB] Title of Page 1
31
+ \n[T] Point 1
32
+ \n[T][T] Subpoint 1
33
+ \n[T][T] Subpoint 2
34
+ \n[T] Point 2
35
+ \n[T] Point 3
36
+ \n[PE]
37
+
38
+ \n\n[PB] Title of Page 2
39
+ \n[T] Point 4
40
+ \n[T][T] Subpoint 3
41
+ \n[T] Point 5
42
+ \n[T] Point 6
43
+ \n[T][T] Subpoint 4
44
+ \n[PE]
45
+
46
+ \n\n... # more pages to follow
47
+
48
+
49
+ [assistant]
50
+
51
+ Great! Please provide me with the current section of the research paper,
52
+ I'll create a clear and informative slide deck following the specified format.
53
+
54
+ [user]
55
+
56
+ Please adhere to the slide deck format and avoid overcrowding slides with excessive text. Consider creating multiple slides if necessary. \nHere is the current section of the paper:
57
+
58
+ \n\n[data_tag_0]
core/chatgpt/dialogue_3.txt ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # there are 4 tags possible
2
+ # user, assistant, system and data_tag_i
3
+
4
+ [user]
5
+
6
+ You are assisting me in creating conference slides from a research paper.
7
+ I will provide you with the slide deck format, and the current draft of the slides.
8
+ Your task is to create clear, concise and informative slides from the draft.
9
+ Current draft contains redundant information, and you should remove it.
10
+ Slides must be interesting, engaging, and informative.
11
+ Modify the slides as you see fit, and make sure that they follow rules and formats described below.
12
+ \n\nRules:
13
+ \n- Slides may be too long, and you should split them into multiple slides.
14
+ \n- Slides may not be coherent, and you should rephrase them to make them more coherent and informative.
15
+ \n- Slides should have a flow, and you should make sure that the flow is preserved.
16
+ \n- There should not be slides with very little text.
17
+ \n- Make slides clear and concise, and avoid overcrowding them with excessive text.
18
+
19
+ [assistant]
20
+
21
+ Understood! Please provide me with the slide deck format to follow.
22
+
23
+ [user]
24
+
25
+ The slides you'll create should have the following format:
26
+
27
+ \n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
28
+ \n- Use [PE] tag to denote the end of a slide.
29
+
30
+ \n\nFor the slide content:
31
+ \n- The slide content will be formatted as a list of bullet points.
32
+ \n- Avoid using '*' or '-' to list items, instead use special tokens [T] and [T][T] as described below.
33
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
34
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
35
+ \n- Do not include [T][T][T] or more levels of indentation.
36
+
37
+ \n\nImportant!!!:
38
+ \n- Avoid creating slides that are overloaded with text.
39
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
40
+
41
+ \n\nIn general slides will look as follows:
42
+ \n[PB] Title of Page 1
43
+ \n[T] Point 1
44
+ \n[T][T] Subpoint 1
45
+ \n[T][T] Subpoint 2
46
+ \n[T] Point 2
47
+ \n[T] Point 3
48
+ \n[PE]
49
+
50
+ \n\n[PB] Title of Page 2
51
+ \n[T] Point 4
52
+ \n[T][T] Subpoint 3
53
+ \n[T] Point 5
54
+ \n[T] Point 6
55
+ \n[T][T] Subpoint 4
56
+ \n[PE]
57
+
58
+ \n\n... # more pages to follow
59
+
60
+ [assistant]
61
+ Great! Please provide me with the current draft of the slides.
62
+
63
+
64
+ [user]
65
+ Here is the current draft of the slides:
66
+ \n\n[data_tag_0]
core/chatgpt/dialogue_4.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # the following dialogue is for GPT-4 model
2
+ # there are 4 tags possible
3
+ # user, assistant, system and data_tag_i
4
+
5
+ [user]
6
+
7
+ You are assisting me in creating conference slides from a research paper.
8
+ I will provide you with the slide deck format, and the current draft of the slides.
9
+ Your task is to create clear, informative slides from the draft.
10
+ Slides must be interesting, engaging, and informative.
11
+ Modify the slides as you see fit, and make sure that they follow rules and formats described below.
12
+ \n\nRules:
13
+ \n- Slides may be too long, and you should split them into multiple slides.
14
+ \n- Slides may not be coherent, and you should rephrase them to make them more coherent and informative.
15
+ \n- Slides should have a flow, and you should make sure that the flow is preserved.
16
+ \n- There should not be slides with very little text.
17
+ \n- Make slides clear and concise, and avoid overcrowding them with excessive text.
18
+
19
+ [assistant]
20
+
21
+ Understood! Please provide me with the slide deck format to follow.
22
+
23
+ [user]
24
+
25
+ The slides you'll create should have the following format:
26
+
27
+ \n- Use [PB] tag to denote the start of a new slide, followed by the slide title.
28
+ \n- Use [PE] tag to denote the end of a slide.
29
+
30
+ \n\nFor the slide content:
31
+ \n- The slide content will be formatted as a list of bullet points.
32
+ \n- Avoid using '*' or '-' for to list items, instead use special tokens [T] and [T][T] as described below.
33
+ \n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
34
+ \n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
35
+ \n- Do not include [T][T][T] or more levels of indentation.
36
+
37
+ \n\nImportant!!!:
38
+ \n- Avoid creating slides that are overloaded with text.
39
+ \n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey main information.
40
+
41
+ \n\nIn general slides will look as follows:
42
+ \n[PB] Title of Page 1
43
+ \n[T] Point 1
44
+ \n[T][T] Subpoint 1
45
+ \n[T][T] Subpoint 2
46
+ \n[T] Point 2
47
+ \n[T] Point 3
48
+ \n[PE]
49
+ /.. /
50
+ /.'
51
+ .'
52
+ .'
53
+
54
+ \n\n[PB] Title of Page 2
55
+ \n[T] Point 4
56
+ \n[T][T] Subpoint 3
57
+ \n[T] Point 5
58
+ \n[T] Point 6
59
+ \n[T][T] Subpoint 4
60
+ \n[PE]
61
+
62
+ \n\n... # more pages to follow
63
+
64
+ [assistant]
65
+ Great! Please provide me with the current draft of the slides.
66
+
67
+
68
+ [user]
69
+ Here is the current draft of the slides:
70
+ \n\n[data_tag_0]
core/chatgpt/generate_slides.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ import numpy as np
6
+ import openai
7
+ import pandas as pd
8
+ import spacy
9
+ import tqdm
10
+ from tqdm import tqdm
11
+
12
+ from .utils import get_num_tokens, parse_prompt, num_tokens_from_messages, clean_slides, slide_generation_ver2, generate_latex_slide
13
+
14
+ nlp = spacy.load('en_core_web_sm')
15
+
16
+
17
def set_openai_api_key(key: str):
    """Configure the module-level OpenAI client with the caller's API key.

    Bug fix: the original assigned the literal string 'key' instead of the
    `key` parameter, so the supplied API key was silently ignored and every
    subsequent API call would fail authentication.
    """
    openai.api_key = key
19
+
20
+
21
def generate_slide(json_pth: str):
    """End-to-end slide generation from a preprocessed paper JSON.

    Pipeline (all chat calls go through the OpenAI API):
      1. Load the paper JSON (title / abstract / sectioned text / figures).
      2. Split over-long sections into <=1400-token pieces at sentence
         boundaries (spaCy sentencizer), labelling continuations "(cont.)".
      3. dialogue_1: draft slides for the first section, seeded with the
         title, table of contents, and abstract.
      4. dialogue_2: draft slides for every remaining section.
      5. Iteratively merge drafts at [PE] page boundaries and refine each
         merged chunk via dialogue_3, until the chunk count stabilises or
         the total token count drops to <=4000.
      6. dialogue_4: one final refinement pass, using GPT-4 when the
         account has access, else GPT-3.5.

    Returns the final slide text in the [PB]/[T]/[T][T]/[PE] tag format.

    NOTE(review): the dialogue_*.txt prompt files are opened with paths
    relative to the current working directory — confirm callers run from
    core/chatgpt/.  This body was reconstructed from a diff rendering that
    lost indentation; confirm the dialogue_3 refinement pass is intended to
    run *inside* the merge loop.
    """

    # Model ids; gpt-4 is only used at the final step and only if available.
    model_list = [model['id'] for model in openai.Model.list()['data']]
    gpt4_id = "gpt-4-0314"
    gpt3_id = 'gpt-3.5-turbo-0301'

    with open(json_pth) as f:
        data = json.load(f)

    title = data['title']
    abstract = data['abstract']
    paper_length = len(data['text'])
    # Each section: [section title, concatenated sentence strings].
    sections = [[head['section'], ' '.join([data['text'][idx]['string'] for idx in range(head['start'], min(head['end'] + 1, paper_length))])] for head in data['headers']]
    # NOTE(review): `figures` is computed but never used below.
    figures = [fig['caption'] for fig in data['figures']]

    ### ! Split the sections by chunks with token_limit
    new_sections = []
    toc = ""  # table of contents, built as "Title1; Title2; ..."
    token_limit = 1400

    for section in sections:
        section_title = section[0]
        curr_count = get_num_tokens(section[1])

        toc += section_title + "; "

        if curr_count > token_limit:
            # split the section into sentences
            sents = nlp(section[1]).sents

            # Greedily pack sentences into pieces below token_limit.
            temp_list = []
            for sent in sents:
                if not temp_list:
                    temp_list.append(sent.text)
                    continue
                curr_count = get_num_tokens(temp_list[-1])
                if curr_count + get_num_tokens(sent.text) < token_limit:
                    temp_list[-1] += sent.text
                else:
                    temp_list.append(sent.text)

            for i in range(len(temp_list)):
                if i == 0:
                    new_sections.append([section_title, temp_list[i]])
                else:
                    new_sections.append([section_title + " (cont.)", temp_list[i]])
        else:
            new_sections.append(section)

    print(f"Total number of sections: {len(new_sections)}")

    # ! get the initial message
    initial_user_message = "Title: " + title + "\nTable of Contents: " + toc + "\nAbstract: " + abstract
    initial_section_title = new_sections[0][0]
    initial_section_content = new_sections[0][1]

    # ! initial dialogue, Generates slides for the first section of the research paper.
    res = []
    # NOTE(review): `data` is reused here, shadowing the loaded paper JSON.
    data = [initial_user_message, initial_section_title, initial_section_content]
    messages = parse_prompt("./dialogue_1.txt", data)
    token_length = num_tokens_from_messages(messages)

    assert token_length < 2400, f"Message is too long: {token_length}"

    response = openai.ChatCompletion.create(
        model=gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    answer = response["choices"][0]["message"]["content"]
    res.append(answer)
    time.sleep(10)

    ### ! Following dialogue. Generates slides for the following sections of the research paper.
    for i, (section_title, section_content) in enumerate(new_sections[1:]):
        print(f"Section {i+1}: {section_title} is being processed...")

        data = [section_content]
        messages = parse_prompt("./dialogue_2.txt", data)

        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"

        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        answer = response["choices"][0]["message"]["content"]
        res.append(answer)

        del messages, token_length, response, answer
        time.sleep(10)  # sleep for 10 seconds to avoid API limit

    ### ! Clean slides from comments, empty lines and other garbage
    for i in range(len(res)):
        res[i] = clean_slides(res[i])

    temp_res = res
    prev_cnt = len(temp_res)

    # Merge-and-refine until the chunk count stops shrinking or the deck
    # is small enough for a single final pass.
    while len(temp_res) > 1:
        temp_num_tokens = get_num_tokens("\n".join(temp_res))
        temp_res = slide_generation_ver2(temp_res, 1800)
        print(f"The length of res is {len(temp_res)}, and the number of tokens is {temp_num_tokens}")

        # if the number of slides is not changed then break
        if len(temp_res) == prev_cnt:
            break
        else:
            prev_cnt = len(temp_res)

        # if the number of tokens is less than 4000 then break
        if temp_num_tokens <= 4000:
            break

        # Refine each merged chunk through dialogue_3.
        new_res = []
        for i in tqdm(range(len(temp_res))):
            data = [temp_res[i]]
            messages = parse_prompt("./dialogue_3.txt", data)

            token_length = num_tokens_from_messages(messages)
            assert token_length < 2400, f"Message is too long: {token_length}"

            response = openai.ChatCompletion.create(
                model=gpt3_id,
                messages=messages,
                temperature=0.9,
            )

            temp = response["choices"][0]["message"]["content"]
            temp = clean_slides(temp)

            new_res.append(temp)
            time.sleep(5)  # needed to avoid API limit

        temp_res = new_res

        time.sleep(10)  # needed to avoid API limit

    # ! final refinement
    final_draft = "\n".join(temp_res)
    data = [final_draft]
    messages = parse_prompt("./dialogue_4.txt", data)

    print(num_tokens_from_messages(messages))

    response = openai.ChatCompletion.create(
        model=gpt4_id if gpt4_id in model_list else gpt3_id,
        messages=messages,
        temperature=0.5,
    )

    temp = response["choices"][0]["message"]["content"]

    # generate_latex_slide(temp, "test.tex")

    return temp
core/chatgpt/utils.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+
3
+ from typing import Dict, Tuple, List
4
+
5
+
6
def slide_generation(res, num_tokens_limit=1800):
    """Greedily merge consecutive slide-text chunks under a token budget.

    The first chunk seeds the accumulator; each following non-empty chunk is
    appended to the current merged chunk while the combined token count stays
    below `num_tokens_limit`, otherwise it starts a new merged chunk.

    Note: assumes `res` is non-empty (res[0] seeds the result).
    """
    merged = [res[0]]
    for chunk in res[1:]:
        if not chunk:
            continue
        combined = get_num_tokens(merged[-1]) + get_num_tokens(chunk)
        if combined < num_tokens_limit:
            merged[-1] += chunk
        else:
            merged.append(chunk)
    return merged
19
+
20
+
21
def slide_generation_ver2(res, num_tokens_limit=1800):
    """Re-chunk slides at [PE] page boundaries, then re-merge under the limit.

    Joins all chunks, splits on the [PE] end-of-page tag, re-appends the tag
    to each non-empty page, and delegates the greedy merging to
    `slide_generation`.
    """
    pages = "\n".join(res).split("[PE]")
    normalized = [f"{page.strip()}\n[PE]\n" if page else "" for page in pages]
    return slide_generation(normalized, num_tokens_limit=num_tokens_limit)
26
+
27
+
28
def parse_prompt(file: str, data: List[str] = None):
    """Parse a dialogue template file into an OpenAI chat message list.

    The template uses [user] / [assistant] / [system] header lines to start a
    new message; subsequent non-empty lines are joined (space-separated) into
    that message's content.  Literal "\\n" sequences become real newlines,
    and [data_tag_i] placeholders are substituted, in order, from `data`.

    Returns a list of {"role": ..., "content": ...} dicts.

    Fixes vs. original: the file handle is now closed (with-statement), and
    `data=None` no longer crashes the final sanity assert.
    """
    if data is None:
        data = []

    roles = []
    contents = []

    with open(file, "r") as fh:
        for line in fh:
            # NOTE(review): this skips any line *containing* '#', not only
            # comment lines — kept as-is, but template content must
            # therefore avoid '#'.
            if "#" in line or not line.strip():
                continue
            if "[user]" in line:
                roles.append("user")
                contents.append([])
            elif "[assistant]" in line:
                roles.append("assistant")
                contents.append([])
            elif "[system]" in line:
                roles.append("system")
                contents.append([])
            else:
                assert roles, "No role specified"
                contents[-1].append(line.strip())

    # Sanity-check role ordering.
    assert roles[0] in ("user", "system"), "First role must be user or system"
    for i in range(1, len(roles)):
        assert roles[i] in ("user", "assistant"), "Roles must be user or assistant"
        assert roles[i] != roles[i - 1], "Roles must alternate between user and assistant"

    contents_str = [" ".join(chunks) for chunks in contents]

    # Substitute data tags in document order.
    curr_idx = 0
    for i in range(len(contents_str)):
        tag = f"[data_tag_{curr_idx}]"
        # replace literal \n with a real newline
        contents_str[i] = contents_str[i].replace("\\n", "\n")
        if tag in contents_str[i]:
            contents_str[i] = contents_str[i].replace(tag, data[curr_idx])
            curr_idx += 1
    assert curr_idx == len(data), "Not all data tags were replaced"

    return [{"role": role, "content": content} for role, content in zip(roles, contents_str)]
78
+
79
+
80
def clean_slides(slide):
    """Strip everything except recognised slide-tag lines.

    Keeps only lines beginning with one of the slide tokens ([F], [T],
    [PB], [PE]) — model chatter, blank lines and other garbage are dropped.
    ([T][T] sub-bullets are covered by the [T] prefix.)
    """
    kept = [line for line in slide.split('\n')
            if line.startswith(('[F]', '[T]', '[PB]', '[PE]'))]
    return '\n'.join(kept)
87
+
88
+
89
def generate_latex_slide(slide, output_path=None):
    """Render tagged slide text ([PB]/[PE]/[T]/[T][T]) as a Beamer document.

    Each [PB]-delimited page becomes one frame: the first line is the frame
    title, and [T] / [T][T] lines become first- / second-level itemize
    entries.  LaTeX-special characters (_ & ^ $) are escaped at the end.

    If `output_path` is given the document is written there; the LaTeX
    source is also returned (new, backward-compatible — the original
    returned None).

    Fixes vs. original: a page with a title but no bullets no longer raises
    TypeError; an empty page mid-deck no longer aborts all remaining pages;
    a page without a newline after its title no longer raises ValueError;
    the per-page debug print was removed.
    """
    latex_code = "\\documentclass{beamer} \n\\begin{document}"

    # Everything before the first [PB] is ignored.
    pages = slide.split('[PB]')[1:]

    for i, page in enumerate(pages):
        page = page.strip()

        # First line is the title; the rest (up to [PE]) is bullet content.
        head, _, rest = page.partition("\n")
        title = head.strip()
        if "[PE]" in title:
            # Single-line page: strip the end tag from the title itself.
            title = title.split("[PE]")[0].strip()
            rest = ""
        content = rest.split("[PE]")[0].strip() if "[PE]" in rest else rest.strip()

        # Parse bullet lines into [point, subpoint, ...] groups.
        groups = []
        for item in content.split('\n'):
            if not item:
                break  # stop at the first blank line (original behavior)
            if '[T][T]' in item:
                assert groups, "Subpoint cannot be the first item in a page"
                groups[-1].append(item.split('[T][T]')[1])
            else:
                text = item.split('[T]')[1] if '[T]' in item else item
                groups.append([text])

        frame = ""
        if title:
            frame += f"\n\\begin{{frame}}{{{title}}}\n\n"
        if groups:
            frame += "\\begin{itemize}\n"
            for group in groups:
                frame += f"\\item {group[0]}\n"
                if len(group) > 1:
                    frame += "\\begin{itemize}\n"
                    for sub in group[1:]:
                        frame += f"\\item {sub}\n"
                    frame += "\\end{itemize}\n"
            frame += "\\end{itemize}\n"

        if not frame:
            # Empty page: skip it rather than aborting the whole deck.
            if i == len(pages) - 1:
                latex_code += "\n\\end{document}"
            continue

        frame += "\n\\end{frame}\n"
        if i == len(pages) - 1:
            # Last page also closes the document.
            frame += "\n\\end{document}"
        latex_code += frame

    # Escape LaTeX specials; the generated markup itself contains none.
    latex_code = latex_code.replace('_', r'\_').replace('&', r'\&').replace('^', r'\^').replace('$', r'\$')

    if output_path:
        with open(output_path, 'w') as f:
            f.write(latex_code)
    return latex_code
166
+
167
+
168
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """
    Count the tokens needed to encode the given chat messages.

    Mirrors the accounting described in:
    https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions#managing-conversations
    """
    encoding = tiktoken.encoding_for_model(model)
    total = 0
    for message in messages:
        # every message follows <im_start>{role/name}\n{content}<im_end>\n
        total += 4
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                # if there's a name, the (always-1-token) role is omitted
                total -= 1
    # every reply is primed with <im_start>assistant
    return total + 2
184
+
185
+
186
def get_num_tokens(message, model="gpt-3.5-turbo-0301"):
    """Token count of a single string under the given model's encoding."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(message))
190
+ return num_tokens
191
+
192
+
193
def get_paper_text_in_chunks(example, chunk_size=4000):
    """Concatenate tagged title/abstract/sections/figures into token-bounded chunks.

    Title ([TB]/[TE]), abstract ([AB]/[AE]), sections ([SB]/[SC]/[SE]) and
    figure captions ([FB]/[FE]) are wrapped in their tag pairs, then packed
    greedily into chunks whose token count stays within `chunk_size`.

    Raises AssertionError if any single section/figure exceeds `chunk_size`.
    """
    paper = example['paper']
    n_text = len(paper['text'])

    title = '[TB] ' + example['title'] + ' [TE] '
    abstract = '[AB] ' + paper['abstract'] + ' [AE] '

    sections = []
    for head in paper['headers']:
        body = ' '.join(paper['text'][idx]['string']
                        for idx in range(head['start'], min(head['end'] + 1, n_text)))
        sections.append(' [SB] ' + head['n'] + ' ' + head['section'] + ' [SC] ' + body + ' [SE] ')
    figures = [' [FB] ' + fig['caption'] + ' [FE] ' for fig in paper['figures']]

    chunks = []
    current = title + abstract
    current_len = get_num_tokens(current)

    for piece in sections + figures:
        piece_len = get_num_tokens(piece)
        assert piece_len < chunk_size, "Section or figure is too long to fit in a chunk"
        if current_len + piece_len > chunk_size:
            chunks.append(current)
            current = piece
            current_len = piece_len
        else:
            current += piece
            current_len += piece_len

    if current_len > 0:
        chunks.append(current)

    return chunks
core/tei.py CHANGED
@@ -99,13 +99,15 @@ class TEIFile(object):
99
  if head.parent.name == 'div':
100
  txt = head.parent.get_text(separator=' ', strip=True)
101
  # the following is only valid for arabic numerals...
102
- if head.get("n"):
103
- sections.append([head.text, head.get('n'), txt])
104
- else:
105
- if len(sections) == 0:
106
- print("Grobid processing error.")
107
- sections[-1][2] += txt
108
- # sections.append([head.text, 'invalid n', txt])
 
 
109
  start = 0
110
  for i in sections:
111
  sent = nltk.tokenize.sent_tokenize(i[2])
 
99
  if head.parent.name == 'div':
100
  txt = head.parent.get_text(separator=' ', strip=True)
101
  # the following is only valid for arabic numerals...
102
+ try:
103
+ if head.get("n"):
104
+ sections.append([head.text, head.get('n'), txt])
105
+ else:
106
+ if len(sections) == 0:
107
+ print("Grobid processing error.")
108
+ sections[-1][2] += txt
109
+ except:
110
+ sections.append([head.text, 'invalid n', txt])
111
  start = 0
112
  for i in sections:
113
  sent = nltk.tokenize.sent_tokenize(i[2])
requirements.txt CHANGED
@@ -7,6 +7,7 @@ lxml
7
  # for interaction with openai
8
  openai
9
  tiktoken
 
10
  # for shiny
11
  anyio==3.6.2
12
  appdirs==1.4.4
 
7
  # for interaction with openai
8
  openai
9
  tiktoken
10
+ spacy
11
  # for shiny
12
  anyio==3.6.2
13
  appdirs==1.4.4
shiny_example_dockerfile DELETED
@@ -1,13 +0,0 @@
1
- FROM python:3.9
2
-
3
- WORKDIR /code
4
-
5
- COPY ./requirements.txt /code/requirements.txt
6
-
7
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
-
9
- COPY . .
10
-
11
- EXPOSE 7860
12
-
13
- CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]