add post-process file
- Dockerfile +4 -0
- app.py +111 -15
- core/chatgpt/dialogue_1.txt +72 -0
- core/chatgpt/dialogue_2.txt +58 -0
- core/chatgpt/dialogue_3.txt +66 -0
- core/chatgpt/dialogue_4.txt +70 -0
- core/chatgpt/generate_slides.py +178 -0
- core/chatgpt/utils.py +220 -0
- core/tei.py +9 -7
- requirements.txt +1 -0
- shiny_example_dockerfile +0 -13
Dockerfile
CHANGED
@@ -34,6 +34,10 @@ RUN python core/init_sbt.py
 # add app
 ADD ./app.py /project/app.py
 EXPOSE 7860
+# create log dir for grobid
+RUN mkdir /opt/grobid/logs
+# download en_core_web_sm
+RUN python -m spacy download en_core_web_sm
 
 # add code
 ADD ./core/ /project/core/
app.py
CHANGED
@@ -5,6 +5,7 @@ from subprocess import call
 from shiny import App, reactive, render, ui
 
 from core.read_pdf import process_pdf, temp_dir
+from core.chatgpt.utils import generate_latex_slide
 
 last_pdf_md5_preprocess_stage = None
 
@@ -29,12 +30,9 @@ def ui_card(title, *args):
 app_ui = ui.page_fluid(
     ui.h1("Document2Slide Demo"),
     ui_card(
-        "Upload PDF",
-        ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=…
+        "Upload PDF and Preprocess",
+        ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=False),
         ui.output_text("upload_file_status", ),
-    ),
-    ui_card(
-        "Preprocess",
         ui.p(
             ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
             ui.output_text("preprocess_result", ),
@@ -42,17 +40,21 @@ app_ui = ui.page_fluid(
         ui.output_text("preprocess_status", ),
         ui.download_button("download_preprocessed", "Download preprocessed file"),
     ),
+    ui.h3("Due to GPT-4's unreliable service, we show this step of the demo locally; see ./core/chatgpt/generate_slides.py for the pipeline."),
     ui_card(
-        "…
-        ui.…
-        …
+        "Upload the generated bullet points in the pre-defined format.",
+        ui.input_file("input_bullet", "Choose a .txt bullet-point file to upload:", multiple=False),
+        ui.output_text("upload_bullet_status", ),
+        ui.p(
+            ui.input_action_button("process_bullet", "Generate .tex", class_="btn-primary"),
+            ui.output_text("process_bullet_result", ),
+        ),
+        ui.p(ui.download_button("download_beamer", "Download Beamer source code")),
+        ui.p(
+            ui.input_action_button("complie_latex", "Compile the LaTeX file generated before.", class_="btn-primary"),
+            ui.output_text("complie_latex_result", ),
+        ),
+        ui.p(ui.download_button("download_slide", "Download the generated slides")),
     ),
 )
@@ -129,5 +131,99 @@ def server(input, output, session):
         call(args, cwd=temp_dir)
         return str(os.path.join(temp_dir, file_name + '.zip'))
 
+    @output
+    @render.text
+    def upload_bullet_status():
+        file_infos = input.input_bullet()
+        # e.g. [{'name': 'Poster.pdf', 'size': 598394, 'type': 'application/pdf', 'datapath': '/tmp/fileupload-2c21fv0a/tmpi91sy07h/0.pdf'}]
+        if not file_infos:
+            return "There is no file provided currently."
+        elif file_infos[0]['type'] != 'text/plain':
+            return "The file you provided is not in .txt format; please upload another one!"
+        else:
+            return ".txt file successfully uploaded!"
+
+    @output
+    @render.text
+    @reactive.event(input.process_bullet)  # take a dependency on the button
+    async def process_bullet_result():
+        file_infos = input.input_bullet()
+        file_name = file_infos[0]['name'] if file_infos else None
+
+        if (file_infos is not None) and file_infos[0]['type'] == 'text/plain':
+            txt_pth = file_infos[0]['datapath']
+            try:
+                with open(txt_pth, 'r') as f:
+                    slide = f.read()
+                output_tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))
+                if not os.path.exists(temp_dir):
+                    os.makedirs(temp_dir)
+
+                generate_latex_slide(slide, output_tex_pth)
+
+                return "Generated the .tex file successfully!"
+            except Exception:
+                return "Something went wrong; please switch to another file!"
+        else:
+            return "No .txt provided, please upload one!"
+
+    @session.download()
+    def download_beamer():
+        file_infos = input.input_bullet()
+        if not file_infos:
+            return
+        file_name = file_infos[0]['name']
+        tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))
+
+        if not os.path.exists(tex_pth):
+            return
+        else:
+            return tex_pth
+
+    @output
+    @render.text
+    @reactive.event(input.complie_latex)  # take a dependency on the button
+    async def complie_latex_result():
+        file_infos = input.input_bullet()
+        if not file_infos:
+            return "No file uploaded yet!"
+
+        file_name = file_infos[0]['name']
+        tex_pth = str(os.path.join(temp_dir, file_name[:-4] + '.tex'))
+
+        if not os.path.exists(tex_pth):
+            return "No .tex file yet; please upload a .txt bullet-point file and convert it to Beamer tex first."
+
+        tex_file_name = tex_pth.split('/')[-1]
+        args = ["latexmk", "-xelatex", tex_file_name]
+        return_code = call(args, cwd=temp_dir)
+
+        if return_code == 0:
+            return "Compile successful!"
+        else:
+            return "Compile failed!"
+
+    @session.download()
+    def download_slide():
+        file_infos = input.input_bullet()
+        if not file_infos:
+            return
+        file_name = file_infos[0]['name']
+        pdf_pth = str(os.path.join(temp_dir, file_name[:-4] + '.pdf'))
+
+        if not os.path.exists(pdf_pth):
+            return
+        else:
+            return pdf_pth
+
 
 app = App(app_ui, server)
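Note: the "pre-defined format" the new upload card expects is the tag format defined in the dialogue templates below. A minimal bullet-point .txt file might look like this (hypothetical content):

    [PB] Introduction
    [T] Motivation for document-to-slide generation
    [T][T] Manual slide making is time-consuming
    [T] Our contribution
    [PE]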
core/chatgpt/dialogue_1.txt
ADDED
@@ -0,0 +1,72 @@
+[user]
+
+You will assist me in creating conference slides from a research paper.
+I will provide you with the slide deck format, title, table of contents, and abstract.
+Afterwards, I will provide you with the first section of the paper.
+In the future, I will provide you with the rest of the sections.
+Ensure the slide deck is clear, informative, and easy to follow.
+
+[assistant]
+
+I'm happy to help you create conference slides from your research paper.
+To get started, please share the slide deck format with me.
+
+[user]
+
+The slides you'll create should have the following format:
+
+\n\n- Use the [PB] tag to denote the start of a new slide, followed by the slide title.
+\n- Use the [PE] tag to denote the end of a slide.
+
+\n\nFor the slide content:
+\n- The slide content will be formatted as a list of bullet points.
+\n- Avoid using '*' or '-' to list items; instead use the special tokens [T] and [T][T] as described below.
+\n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
+\n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
+\n- You can include multiple [T] and [T][T] points on each page, but try to keep it to a reasonable amount to avoid clutter.
+\n- 3-4 bullet points (lines beginning with [T]) per page is a good rule of thumb!!!
+
+\n\nImportant!!!:
+\n- Avoid creating slides that are overloaded with text.
+\n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey the main information.
+
+\n\nIn general slides will look as follows:
+\n[PB] Title of Page 1
+\n[T] Point 1
+\n[T][T] Subpoint 1
+\n[T][T] Subpoint 2
+\n[T] Point 2
+\n[T] Point 3
+\n[PE]
+
+\n\n[PB] Title of Page 2
+\n[T] Point 4
+\n[T][T] Subpoint 3
+\n[T] Point 5
+\n[T] Point 6
+\n[T][T] Subpoint 4
+\n[PE]
+
+\n\n... # more pages to follow
+
+[assistant]
+
+Great! Please provide me with the title, table of contents, and abstract of your research paper.
+Once I have that information, we can proceed with creating the slides together.
+
+[user]
+
+[data_tag_0]
+
+[assistant]
+
+Now that I have the title, abstract, and table of contents, please provide the first section of the paper,
+[data_tag_1].
+I'll create a clear and informative slide deck following the specified format.
+
+[user]
+
+Please adhere to the slide deck format and avoid overcrowding slides with excessive text.
+Consider creating additional slides if necessary.
+\nHere is the first section of the paper:
+\n\n[data_tag_2]
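These templates are consumed by parse_prompt in core/chatgpt/utils.py (added below), which maps the [user]/[assistant]/[system] tags to chat roles and substitutes the [data_tag_i] placeholders in order. A sketch of the intended call, with hypothetical paper data:

    from core.chatgpt.utils import parse_prompt

    # hypothetical inputs; the real values come from the parsed paper JSON
    data = [
        "Title: ...\nTable of Contents: Intro; Method; \nAbstract: ...",  # -> [data_tag_0]
        "Introduction",                                                   # -> [data_tag_1]
        "Full text of the first section ...",                             # -> [data_tag_2]
    ]
    messages = parse_prompt("core/chatgpt/dialogue_1.txt", data)
    # messages is a list of {"role": ..., "content": ...} dicts, ready to pass to
    # openai.ChatCompletion.create(model=..., messages=messages)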
core/chatgpt/dialogue_2.txt
ADDED
@@ -0,0 +1,58 @@
+[user]
+
+You are assisting me in creating conference slides from a research paper.
+I will provide you with the slide deck format, and the current section of the paper.
+Then, you will create informative slides from it.
+
+[assistant]
+
+Understood! Please provide me with the slide deck format to follow.
+
+[user]
+
+The slides you'll create should have the following format:
+
+\n\n- Use the [PB] tag to denote the start of a new slide, followed by the slide title.
+\n- Use the [PE] tag to denote the end of a slide.
+
+\n\nFor the slide content:
+\n- The slide content will be formatted as a list of bullet points.
+\n- Avoid using '*' or '-' to list items; instead use the special tokens [T] and [T][T] as described below.
+\n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
+\n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
+\n- Do not include [T][T][T] or more levels of indentation.
+
+\n\nImportant!!!:
+\n- Avoid creating slides that are overloaded with text.
+\n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey the main information.
+
+\n\nIn general slides will look as follows:
+\n[PB] Title of Page 1
+\n[T] Point 1
+\n[T][T] Subpoint 1
+\n[T][T] Subpoint 2
+\n[T] Point 2
+\n[T] Point 3
+\n[PE]
+
+\n\n[PB] Title of Page 2
+\n[T] Point 4
+\n[T][T] Subpoint 3
+\n[T] Point 5
+\n[T] Point 6
+\n[T][T] Subpoint 4
+\n[PE]
+
+\n\n... # more pages to follow
+
+
+[assistant]
+
+Great! Please provide me with the current section of the research paper,
+and I'll create a clear and informative slide deck following the specified format.
+
+[user]
+
+Please adhere to the slide deck format and avoid overcrowding slides with excessive text. Consider creating multiple slides if necessary. \nHere is the current section of the paper:
+
+\n\n[data_tag_0]
core/chatgpt/dialogue_3.txt
ADDED
@@ -0,0 +1,66 @@
+# there are 4 tags possible
+# user, assistant, system and data_tag_i
+
+[user]
+
+You are assisting me in creating conference slides from a research paper.
+I will provide you with the slide deck format, and the current draft of the slides.
+Your task is to create clear, concise and informative slides from the draft.
+The current draft contains redundant information, and you should remove it.
+Slides must be interesting, engaging, and informative.
+Modify the slides as you see fit, and make sure that they follow the rules and formats described below.
+\n\nRules:
+\n- Slides may be too long, and you should split them into multiple slides.
+\n- Slides may not be coherent, and you should rephrase them to make them more coherent and informative.
+\n- Slides should have a flow, and you should make sure that the flow is preserved.
+\n- There should not be slides with very little text.
+\n- Make slides clear and concise, and avoid overcrowding them with excessive text.
+
+[assistant]
+
+Understood! Please provide me with the slide deck format to follow.
+
+[user]
+
+The slides you'll create should have the following format:
+
+\n- Use the [PB] tag to denote the start of a new slide, followed by the slide title.
+\n- Use the [PE] tag to denote the end of a slide.
+
+\n\nFor the slide content:
+\n- The slide content will be formatted as a list of bullet points.
+\n- Avoid using '*' or '-' to list items; instead use the special tokens [T] and [T][T] as described below.
+\n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
+\n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
+\n- Do not include [T][T][T] or more levels of indentation.
+
+\n\nImportant!!!:
+\n- Avoid creating slides that are overloaded with text.
+\n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey the main information.
+
+\n\nIn general slides will look as follows:
+\n[PB] Title of Page 1
+\n[T] Point 1
+\n[T][T] Subpoint 1
+\n[T][T] Subpoint 2
+\n[T] Point 2
+\n[T] Point 3
+\n[PE]
+
+\n\n[PB] Title of Page 2
+\n[T] Point 4
+\n[T][T] Subpoint 3
+\n[T] Point 5
+\n[T] Point 6
+\n[T][T] Subpoint 4
+\n[PE]
+
+\n\n... # more pages to follow
+
+[assistant]
+Great! Please provide me with the current draft of the slides.
+
+
+[user]
+Here is the current draft of the slides:
+\n\n[data_tag_0]
core/chatgpt/dialogue_4.txt
ADDED
@@ -0,0 +1,70 @@
+# the following dialogue is for the GPT-4 model
+# there are 4 tags possible
+# user, assistant, system and data_tag_i
+
+[user]
+
+You are assisting me in creating conference slides from a research paper.
+I will provide you with the slide deck format, and the current draft of the slides.
+Your task is to create clear, informative slides from the draft.
+Slides must be interesting, engaging, and informative.
+Modify the slides as you see fit, and make sure that they follow the rules and formats described below.
+\n\nRules:
+\n- Slides may be too long, and you should split them into multiple slides.
+\n- Slides may not be coherent, and you should rephrase them to make them more coherent and informative.
+\n- Slides should have a flow, and you should make sure that the flow is preserved.
+\n- There should not be slides with very little text.
+\n- Make slides clear and concise, and avoid overcrowding them with excessive text.
+
+[assistant]
+
+Understood! Please provide me with the slide deck format to follow.
+
+[user]
+
+The slides you'll create should have the following format:
+
+\n- Use the [PB] tag to denote the start of a new slide, followed by the slide title.
+\n- Use the [PE] tag to denote the end of a slide.
+
+\n\nFor the slide content:
+\n- The slide content will be formatted as a list of bullet points.
+\n- Avoid using '*' or '-' to list items; instead use the special tokens [T] and [T][T] as described below.
+\n- Use [T] at the beginning of a new line to create a first-level itemized symbol.
+\n- Use [T][T] to create a second-level itemized symbol with an indent for subpoints.
+\n- Do not include [T][T][T] or more levels of indentation.
+
+\n\nImportant!!!:
+\n- Avoid creating slides that are overloaded with text.
+\n- Try to keep the text to a minimum, and use bullet points and sub-bullets to convey the main information.
+
+\n\nIn general slides will look as follows:
+\n[PB] Title of Page 1
+\n[T] Point 1
+\n[T][T] Subpoint 1
+\n[T][T] Subpoint 2
+\n[T] Point 2
+\n[T] Point 3
+\n[PE]
+
+\n\n[PB] Title of Page 2
+\n[T] Point 4
+\n[T][T] Subpoint 3
+\n[T] Point 5
+\n[T] Point 6
+\n[T][T] Subpoint 4
+\n[PE]
+
+\n\n... # more pages to follow
+
+[assistant]
+Great! Please provide me with the current draft of the slides.
+
+
+[user]
+Here is the current draft of the slides:
+\n\n[data_tag_0]
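Taken together, the four templates implement the pipeline in generate_slides.py below: dialogue_1 turns the first section (plus title, table of contents, and abstract) into a draft, dialogue_2 handles each subsequent section, dialogue_3 condenses and merges the per-section drafts, and dialogue_4 performs a final refinement pass, using GPT-4 when it is available.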
core/chatgpt/generate_slides.py
ADDED
@@ -0,0 +1,178 @@
+import json
+import os
+import time
+
+import numpy as np
+import openai
+import pandas as pd
+import spacy
+from tqdm import tqdm
+
+from .utils import get_num_tokens, parse_prompt, num_tokens_from_messages, clean_slides, slide_generation_ver2, generate_latex_slide
+
+nlp = spacy.load('en_core_web_sm')
+
+
+def set_openai_api_key(key: str):
+    openai.api_key = key
+
+
+def generate_slide(json_pth: str):
+
+    model_list = [model['id'] for model in openai.Model.list()['data']]
+    gpt4_id = "gpt-4-0314"
+    gpt3_id = 'gpt-3.5-turbo-0301'
+
+    with open(json_pth) as f:
+        data = json.load(f)
+
+    title = data['title']
+    abstract = data['abstract']
+    paper_length = len(data['text'])
+    sections = [[head['section'], ' '.join([data['text'][idx]['string'] for idx in range(head['start'], min(head['end'] + 1, paper_length))])] for head in data['headers']]
+    figures = [fig['caption'] for fig in data['figures']]
+
+    ### ! Split the sections into chunks within token_limit
+    new_sections = []
+    toc = ""
+    token_limit = 1400
+
+    for section in sections:
+        section_title = section[0]
+        curr_count = get_num_tokens(section[1])
+
+        toc += section_title + "; "
+
+        if curr_count > token_limit:
+            # split the section into sentences
+            sents = nlp(section[1]).sents
+
+            temp_list = []
+            for sent in sents:
+                if not temp_list:
+                    temp_list.append(sent.text)
+                    continue
+                curr_count = get_num_tokens(temp_list[-1])
+                if curr_count + get_num_tokens(sent.text) < token_limit:
+                    temp_list[-1] += ' ' + sent.text
+                else:
+                    temp_list.append(sent.text)
+
+            for i in range(len(temp_list)):
+                if i == 0:
+                    new_sections.append([section_title, temp_list[i]])
+                else:
+                    new_sections.append([section_title + " (cont.)", temp_list[i]])
+        else:
+            new_sections.append(section)
+
+    print(f"Total number of sections: {len(new_sections)}")
+
+    # ! get the initial message
+    initial_user_message = "Title: " + title + "\nTable of Contents: " + toc + "\nAbstract: " + abstract
+    initial_section_title = new_sections[0][0]
+    initial_section_content = new_sections[0][1]
+
+    # ! initial dialogue: generates slides for the first section of the research paper
+    res = []
+    data = [initial_user_message, initial_section_title, initial_section_content]
+    messages = parse_prompt("./dialogue_1.txt", data)
+    token_length = num_tokens_from_messages(messages)
+
+    assert token_length < 2400, f"Message is too long: {token_length}"
+
+    response = openai.ChatCompletion.create(
+        model=gpt3_id,
+        messages=messages,
+        temperature=0.5,
+    )
+    answer = response["choices"][0]["message"]["content"]
+    res.append(answer)
+    time.sleep(10)
+
+    ### ! Following dialogues: generate slides for the remaining sections of the research paper
+    for i, (section_title, section_content) in enumerate(new_sections[1:]):
+        print(f"Section {i+1}: {section_title} is being processed...")
+
+        data = [section_content]
+        messages = parse_prompt("./dialogue_2.txt", data)
+
+        token_length = num_tokens_from_messages(messages)
+        assert token_length < 2400, f"Message is too long: {token_length}"
+
+        response = openai.ChatCompletion.create(
+            model=gpt3_id,
+            messages=messages,
+            temperature=0.9,
+        )
+        answer = response["choices"][0]["message"]["content"]
+        res.append(answer)
+
+        del messages, token_length, response, answer
+        time.sleep(10)  # sleep for 10 seconds to avoid the API rate limit
+
+    ### ! Clean the slides of comments, empty lines and other garbage
+    for i in range(len(res)):
+        res[i] = clean_slides(res[i])
+
+    temp_res = res
+    prev_cnt = len(temp_res)
+
+    while len(temp_res) > 1:
+        temp_num_tokens = get_num_tokens("\n".join(temp_res))
+        temp_res = slide_generation_ver2(temp_res, 1800)
+        print(f"The length of res is {len(temp_res)}, and the number of tokens is {temp_num_tokens}")
+
+        # if the number of slide groups has not changed, then break
+        if len(temp_res) == prev_cnt:
+            break
+        else:
+            prev_cnt = len(temp_res)
+
+        # if the number of tokens is at most 4000, then break
+        if temp_num_tokens <= 4000:
+            break
+
+        new_res = []
+        for i in tqdm(range(len(temp_res))):
+            data = [temp_res[i]]
+            messages = parse_prompt("./dialogue_3.txt", data)
+
+            token_length = num_tokens_from_messages(messages)
+            assert token_length < 2400, f"Message is too long: {token_length}"
+
+            response = openai.ChatCompletion.create(
+                model=gpt3_id,
+                messages=messages,
+                temperature=0.9,
+            )
+
+            temp = response["choices"][0]["message"]["content"]
+            temp = clean_slides(temp)
+
+            new_res.append(temp)
+            time.sleep(5)  # needed to avoid the API rate limit
+
+        temp_res = new_res
+
+        time.sleep(10)  # needed to avoid the API rate limit
+
+    # ! final refinement
+    final_draft = "\n".join(temp_res)
+    data = [final_draft]
+    messages = parse_prompt("./dialogue_4.txt", data)
+
+    print(num_tokens_from_messages(messages))
+
+    response = openai.ChatCompletion.create(
+        model=gpt4_id if gpt4_id in model_list else gpt3_id,
+        messages=messages,
+        temperature=0.5,
+    )
+
+    temp = response["choices"][0]["message"]["content"]
+
+    # generate_latex_slide(temp, "test.tex")
+
+    return temp
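The module is meant to be driven from a script or notebook; a minimal sketch of the entry point, assuming a paper JSON produced by the preprocessing stage (path and key are placeholders):

    from core.chatgpt.generate_slides import set_openai_api_key, generate_slide
    from core.chatgpt.utils import generate_latex_slide

    set_openai_api_key("sk-...")  # placeholder key

    # note: parse_prompt loads ./dialogue_*.txt relative to the working
    # directory, so run this from core/chatgpt/
    slides = generate_slide("paper.json")       # [PB]/[PE]/[T]-tagged draft
    generate_latex_slide(slides, "slides.tex")  # convert the draft to Beamer source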
core/chatgpt/utils.py
ADDED
@@ -0,0 +1,220 @@
+import tiktoken
+
+from typing import Dict, Tuple, List
+
+
+def slide_generation(res, num_tokens_limit=1800):
+    new_res = [res[0]]
+
+    for i in range(1, len(res)):
+        if not res[i]:
+            continue
+        prev_cnt = get_num_tokens(new_res[-1])
+        curr_cnt = get_num_tokens(res[i])
+        if prev_cnt + curr_cnt < num_tokens_limit:
+            new_res[-1] += res[i]
+        else:
+            new_res.append(res[i])
+    return new_res
+
+
+def slide_generation_ver2(res, num_tokens_limit=1800):
+    text = "\n".join(res).split("[PE]")
+    # re-attach the [PE] tag to every non-empty chunk
+    text = [(t.strip() + "\n[PE]\n") if t.strip() else "" for t in text]
+    return slide_generation(text, num_tokens_limit=num_tokens_limit)
+
+
+def parse_prompt(file: str, data: List[str] = None):
+    roles = []
+    contents = []
+
+    with open(file, "r") as f:
+        lines = f.readlines()
+
+    for line in lines:
+        # skip empty lines and whole-line comments
+        if not line.strip() or line.strip().startswith("#"):
+            continue
+        if "[user]" in line:
+            roles.append("user")
+            contents.append([])
+            continue
+        elif "[assistant]" in line:
+            roles.append("assistant")
+            contents.append([])
+            continue
+        elif "[system]" in line:
+            roles.append("system")
+            contents.append([])
+            continue
+        if line.strip():
+            assert roles, "No role specified"
+            contents[-1].append(line.strip())
+
+    # checking roles
+    assert roles[0] in ["user", "system"], "First role must be user or system"
+    for i in range(1, len(roles)):
+        assert roles[i] in ["user", "assistant"], "Roles must be user or assistant"
+        assert roles[i] != roles[i - 1], "Roles must alternate between user and assistant"
+
+    contents_str = []
+    for content in contents:
+        contents_str.append(" ".join(content))
+
+    curr_idx = 0
+    for i in range(len(contents_str)):
+        tag = f"[data_tag_{curr_idx}]"
+        # replace the literal \n with a real newline
+        contents_str[i] = contents_str[i].replace("\\n", "\n")
+        if tag in contents_str[i]:
+            contents_str[i] = contents_str[i].replace(tag, data[curr_idx])
+            curr_idx += 1
+    assert curr_idx == len(data), "Not all data tags were replaced"
+
+    messages = []
+    for i in range(len(roles)):
+        messages.append({"role": roles[i], "content": contents_str[i]})
+
+    return messages
+
+
+def clean_slides(slide):
+    slide_list = slide.split('\n')
+    clean_slide_list = []
+    for line in slide_list:
+        # keep only lines that carry the slide markup ([T][T] is covered by the [T] prefix)
+        if line.startswith(('[F]', '[T]', '[PB]', '[PE]')):
+            clean_slide_list.append(line)
+    return '\n'.join(clean_slide_list)
+
+
+def generate_latex_slide(slide, output_path=None):
+    # Initialize the Beamer document
+    latex_code = "\\documentclass{beamer} \n\\begin{document}"
+
+    # Split the slide string into pages
+    pages = slide.split('[PB]')[1:]
+
+    # Iterate through each page
+    for i, page in enumerate(pages):
+        tmp_list = [None, None]  # [title, content]
+
+        page = page.strip()
+
+        # Extract the page title and content
+        title_end_index = page.index("\n") + 1
+        title = page[:title_end_index].strip()
+        content_end_index = page.index("[PE]")
+        content = page[title_end_index:content_end_index].strip()
+
+        # Start a new frame with the page title
+        if title:
+            tmp_list[0] = f"\n\\begin{{frame}}{{{title}}}\n\n"
+
+        # Split the content into list items
+        items = content.split('\n')
+
+        p = []
+        for item in items:
+            if not item:
+                break
+            if '[T][T]' in item:
+                assert len(p) > 0, "Subpoint cannot be the first item in a page"
+                subpoints = item.split('[T][T]')[1]
+                p[-1].append(subpoints)
+            else:
+                if '[T]' in item:
+                    point = item.split('[T]')[1]
+                else:
+                    point = item
+                p.append([point])
+
+        if p:
+            # Add each item as a Beamer itemize element
+            tmp_list[1] = "\\begin{itemize}\n"
+            for point in p:
+                if not point:
+                    break
+                tmp_list[1] += f"\\item {point[0]}\n"
+                if len(point) > 1:
+                    tmp_list[1] += "\\begin{itemize}\n"
+                    for subpoint in point[1:]:
+                        tmp_list[1] += f"\\item {subpoint}\n"
+                    tmp_list[1] += "\\end{itemize}\n"
+            tmp_list[1] += "\\end{itemize}\n"
+
+        if tmp_list[0] is None and tmp_list[1] is None:
+            # The page is empty, so skip it
+            if i == len(pages) - 1:
+                # This is the last page, so end the document instead of the frame
+                latex_code += "\n\\end{document}"
+            continue
+
+        # guard against a page with a title but no bullet points (or vice versa)
+        if tmp_list[0] is None:
+            tmp_list[0] = ""
+        if tmp_list[1] is None:
+            tmp_list[1] = ""
+
+        # End the frame
+        tmp_list[1] += "\n\\end{frame}\n"
+        if i == len(pages) - 1:
+            # This is the last page, so end the document after the frame
+            tmp_list[1] += "\n\\end{document}"
+
+        latex_code += "".join(tmp_list)
+
+    # escape characters that are special in LaTeX
+    latex_code = latex_code.replace('_', r'\_').replace('&', r'\&').replace('^', r'\^{}').replace('$', r'\$')
+    if output_path:
+        with open(output_path, 'w') as f:
+            f.write(latex_code)
+    return latex_code
+
+
+def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
+    """
+    Returns the number of tokens required to encode the given messages.
+
+    source: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions#managing-conversations
+    """
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = 0
+    for message in messages:
+        num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
+        for key, value in message.items():
+            num_tokens += len(encoding.encode(value))
+            if key == "name":  # if there's a name, the role is omitted
+                num_tokens += -1  # role is always required and always 1 token
+    num_tokens += 2  # every reply is primed with <im_start>assistant
+    return num_tokens
+
+
+def get_num_tokens(message, model="gpt-3.5-turbo-0301"):
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = 0
+    num_tokens += len(encoding.encode(message))
+    return num_tokens
+
+
+def get_paper_text_in_chunks(example, chunk_size=4000):
+    paper_length = len(example['paper']['text'])
+
+    title = '[TB] ' + example['title'] + ' [TE] '
+    abstract = '[AB] ' + example['paper']['abstract'] + ' [AE] '
+
+    sections = [' [SB] ' + head['n'] + ' ' + head['section'] + ' [SC] ' + ' '.join([example['paper']['text'][idx]['string'] for idx in range(head['start'], min(head['end'] + 1, paper_length))]) + ' [SE] ' for head in example['paper']['headers']]
+    figures = [' [FB] ' + fig['caption'] + ' [FE] ' for fig in example['paper']['figures']]
+
+    chunks = []
+
+    temp_chunk = title + abstract
+    temp_chunk_length = get_num_tokens(temp_chunk)
+
+    for s in sections + figures:
+        assert get_num_tokens(s) < chunk_size, "Section or figure is too long to fit in a chunk"
+        if temp_chunk_length + get_num_tokens(s) > chunk_size:
+            chunks.append(temp_chunk)
+            temp_chunk = s
+            temp_chunk_length = get_num_tokens(s)
+        else:
+            temp_chunk += s
+            temp_chunk_length += get_num_tokens(s)
+
+    if temp_chunk_length > 0:
+        chunks.append(temp_chunk)
+
+    return chunks
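As a quick illustration of the tag-to-Beamer conversion (output file name hypothetical):

    from core.chatgpt.utils import generate_latex_slide

    draft = (
        "[PB] Motivation\n"
        "[T] Slide creation is tedious\n"
        "[T][T] Especially for long papers\n"
        "[T] We automate it with an LLM pipeline\n"
        "[PE]\n"
    )
    generate_latex_slide(draft, "demo.tex")
    # demo.tex now contains a Beamer document with one frame titled
    # "Motivation" holding a two-level itemize list.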
core/tei.py
CHANGED
@@ -99,13 +99,15 @@ class TEIFile(object):
         if head.parent.name == 'div':
             txt = head.parent.get_text(separator=' ', strip=True)
             # the following is only valid for arabic numerals...
-            …
+            try:
+                if head.get("n"):
+                    sections.append([head.text, head.get('n'), txt])
+                else:
+                    if len(sections) == 0:
+                        print("Grobid processing error.")
+                    sections[-1][2] += txt
+            except Exception:
+                sections.append([head.text, 'invalid n', txt])
         start = 0
         for i in sections:
             sent = nltk.tokenize.sent_tokenize(i[2])
requirements.txt
CHANGED
@@ -7,6 +7,7 @@ lxml
 # for interaction with openai
 openai
 tiktoken
+spacy
 # for shiny
 anyio==3.6.2
 appdirs==1.4.4
shiny_example_dockerfile
DELETED
@@ -1,13 +0,0 @@
-FROM python:3.9
-
-WORKDIR /code
-
-COPY ./requirements.txt /code/requirements.txt
-
-RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
-
-COPY . .
-
-EXPOSE 7860
-
-CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]