Upload 3 files
- app.py +136 -0
- get_paper_from_pdf.py +193 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,136 @@
import numpy as np
import os
import re
import datetime
import time
import openai, tenacity
import argparse
import configparser
import json
import tiktoken
from get_paper_from_pdf import Paper
import gradio

# Define the Response class
class Response:
    # Initializer: store the inputs and set up the tokenizer
    def __init__(self, api, comment, language):
        self.api = api
        self.comment = comment
        self.language = language
        self.max_token_num = 4096
        self.encoding = tiktoken.get_encoding("gpt2")
        # The uploaded file never initialized the key list used below in chat_response;
        # assume the textbox holds a single key (or several comma-separated keys).
        self.chat_api_list = [key.strip() for key in api.split(',') if key.strip()]
        self.cur_api = 0

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_response(self, text):
        # Rotate through the available API keys
        openai.api_key = self.chat_api_list[self.cur_api]
        self.cur_api += 1
        self.cur_api = 0 if self.cur_api >= len(self.chat_api_list) else self.cur_api
        # Reserve tokens for the reply and truncate the input to fit the context window
        response_prompt_token = 1000
        text_token = len(self.encoding.encode(self.comment))
        input_text_index = int(len(text) * (self.max_token_num - response_prompt_token) / text_token)
        input_text = "This is the review comments:" + text[:input_text_index]
        messages = [
            {"role": "system", "content": """You are the author, you submitted a paper, and the reviewers gave the review comments.
Please reply with what we have done, not what we will do.
You need to extract questions from the review comments one by one, and then respond point-to-point to the reviewers' concerns.
Please answer in {}. Follow the format of the output later:
- Response to reviewers
#1 reviewer
Concern #1: xxxx
Author response: xxxxx
Concern #2: xxxx
Author response: xxxxx
...
#2 reviewer
Concern #1: xxxx
Author response: xxxxx
Concern #2: xxxx
Author response: xxxxx
...
#3 reviewer
Concern #1: xxxx
Author response: xxxxx
Concern #2: xxxx
Author response: xxxxx
...
""".format(self.language)
             },
            {"role": "user", "content": input_text},
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        result = ''
        for choice in response.choices:
            result += choice.message.content
        print("********" * 10)
        print(result)
        print("********" * 10)
        print("prompt_token_used:", response.usage.prompt_tokens)
        print("completion_token_used:", response.usage.completion_tokens)
        print("total_token_used:", response.usage.total_tokens)
        print("response_time:", response.response_ms / 1000.0, 's')
        return result, response.usage.total_tokens


def main(api, comment, language):
    start_time = time.time()
    if not api or not comment:
        # The interface has two output boxes, so return a value for each
        return "Please enter your API key and the review comments!", ""
    else:
        Response1 = Response(api, comment, language)
        response, total_token_used = Response1.chat_response(text=comment)
        time_used = time.time() - start_time
        output2 = "Tokens used: " + str(total_token_used) + "\nTime taken: " + str(round(time_used, 2)) + " s"
        return response, output2


########################################################################################################
# Title
title = "🤖ChatResponse🤖"
# Description
description = '''<div align='left'>
<img align='right' src='http://i.imgtg.com/2023/03/22/94PLN.png' width="250">

<strong>ChatResponse is an AI assistant that automatically generates author replies from reviewers' comments.</strong> What it does:

⭐️ From the review comments you enter, ChatResponse automatically extracts each reviewer's questions and concerns and generates point-to-point replies.

If the Space feels slow, you can click "Duplicate this Space" in the top-right corner to copy ChatResponse into your own Space!

The project is on [Github](https://github.com/nishiwen1214/ChatResponse); Stars and Forks are welcome, as is sponsorship to help the project grow! 💗 ([Get an API Key](https://chatgpt.cn.obiscr.com/blog/posts/2023/How-to-get-api-key/))
</div>
'''

# Build the Gradio interface
inp = [gradio.inputs.Textbox(label="Enter your API-key (the string starting with sk)",
                             default="",
                             type='password'),
       gradio.inputs.Textbox(lines=5,
                             label="Enter the review comments to respond to",
                             default=""
                             ),
       gradio.inputs.Radio(choices=["English", "Chinese"],
                           default="English",
                           label="Choose the output language"),
       ]

chat_Response_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs=[gradio.Textbox(lines=20, label="Response result"), gradio.Textbox(lines=2, label="Usage statistics")],
                                     title=title,
                                     description=description)

# Start server
chat_Response_gui.launch(quiet=True, show_api=False)
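For reference, a minimal sketch of exercising the response logic outside the Gradio UI, for example during a local test. The API key and review text below are placeholders, and the sketch assumes the launch() call at the bottom of app.py has been guarded or commented out before importing:

# Hypothetical local smoke test; not part of the Space itself.
from app import Response

api_key = "sk-..."  # placeholder, supply your own key
review = "Reviewer 1: The ablation study is missing.\nReviewer 2: Please clarify the dataset split."
r = Response(api=api_key, comment=review, language="English")
reply, tokens_used = r.chat_response(review)
print(reply)
print("total tokens:", tokens_used)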
get_paper_from_pdf.py
ADDED
@@ -0,0 +1,193 @@
import fitz, io, os
from PIL import Image
from collections import Counter
import json
import re

class Paper:
    def __init__(self, path, title='', url='', abs='', authors=[]):
        # Initialize a Paper object from a PDF path
        self.url = url  # link to the article
        self.path = path  # path to the PDF
        self.section_names = []  # section titles
        self.section_texts = {}  # section contents
        self.abs = abs
        self.title_page = 0
        if title == '':
            self.pdf = fitz.open(self.path)  # the PDF document
            self.title = self.get_title()
            self.parse_pdf()
        else:
            self.title = title
        self.authors = authors
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''

    def parse_pdf(self):
        self.pdf = fitz.open(self.path)  # the PDF document
        self.text_list = [page.get_text() for page in self.pdf]
        self.all_text = ' '.join(self.text_list)
        self.extract_section_infomation()
        self.section_texts.update({"title": self.title})
        self.pdf.close()

    # Identify chapter headings by their numbering pattern and return them as a list
    def get_chapter_names(self):
        # Open the PDF file
        doc = fitz.open(self.path)  # the PDF document
        text_list = [page.get_text() for page in doc]
        all_text = ''
        for text in text_list:
            all_text += text
        # List used to collect chapter names
        chapter_names = []
        for line in all_text.split('\n'):
            line_list = line.split(' ')
            if '.' in line:
                point_split_list = line.split('.')
                space_split_list = line.split(' ')
                if 1 < len(space_split_list) < 5:
                    if 1 < len(point_split_list) < 5 and (
                            point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
                        # print("line:", line)
                        chapter_names.append(line)

        return chapter_names

    def get_title(self):
        doc = self.pdf  # the opened PDF document
        max_font_size = 0  # largest font size seen so far
        max_string = ""  # text of the span with the largest font size
        max_font_sizes = [0]
        for page_index, page in enumerate(doc):  # iterate over pages
            text = page.get_text("dict")  # text info of the page
            blocks = text["blocks"]  # list of text blocks
            for block in blocks:  # iterate over blocks
                if block["type"] == 0 and len(block['lines']):  # text block
                    if len(block["lines"][0]["spans"]):
                        font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span of the first line
                        max_font_sizes.append(font_size)
                        if font_size > max_font_size:  # new largest font size
                            max_font_size = font_size
                            max_string = block["lines"][0]["spans"][0]["text"]  # remember its text
        max_font_sizes.sort()
        # print("max_font_sizes", max_font_sizes[-10:])
        cur_title = ''
        for page_index, page in enumerate(doc):  # iterate over pages
            text = page.get_text("dict")  # text info of the page
            blocks = text["blocks"]  # list of text blocks
            for block in blocks:  # iterate over blocks
                if block["type"] == 0 and len(block['lines']):  # text block
                    if len(block["lines"][0]["spans"]):
                        cur_string = block["lines"][0]["spans"][0]["text"]  # text of the first span
                        font_flags = block["lines"][0]["spans"][0]["flags"]  # font flags of the first span
                        font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span
                        # print(font_size)
                        if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
                            # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
                            if len(cur_string) > 4 and "arXiv" not in cur_string:
                                # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
                                if cur_title == '':
                                    cur_title += cur_string
                                else:
                                    cur_title += ' ' + cur_string
                                self.title_page = page_index
                                # break
        title = cur_title.replace('\n', ' ')
        return title

    def extract_section_infomation(self):
        doc = fitz.open(self.path)

        # Collect every font size used in the document
        font_sizes = []
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if 'lines' not in block:
                    continue
                lines = block["lines"]
                for line in lines:
                    for span in line["spans"]:
                        font_sizes.append(span["size"])
        most_common_size, _ = Counter(font_sizes).most_common(1)[0]

        # Use the most frequent font size as the threshold for heading fonts
        threshold = most_common_size * 1

        section_dict = {}
        last_heading = None
        subheadings = []
        heading_font = -1
        # Walk through every page looking for subheadings
        found_abstract = False
        upper_heading = False
        font_heading = False
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if not found_abstract:
                    try:
                        text = json.dumps(block)
                    except:
                        continue
                    if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                        found_abstract = True
                        last_heading = "Abstract"
                        section_dict["Abstract"] = ""
                if found_abstract:
                    if 'lines' not in block:
                        continue
                    lines = block["lines"]
                    for line in lines:
                        for span in line["spans"]:
                            # If the current span looks like a subheading
                            if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <= 'Z')) > 4:  # papers whose headings are all uppercase at the body font size
                                upper_heading = True
                                heading = span["text"].strip()
                                if "References" in heading:  # ignore everything after the references
                                    self.section_names = subheadings
                                    self.section_texts = section_dict
                                    return
                                subheadings.append(heading)
                                if last_heading is not None:
                                    section_dict[last_heading] = section_dict[last_heading].strip()
                                section_dict[heading] = ""
                                last_heading = heading
                            if not upper_heading and span["size"] > threshold and re.match(  # normal case: detect headings by font size
                                    r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
                                    span["text"].strip()):
                                font_heading = True
                                if heading_font == -1:
                                    heading_font = span["size"]
                                elif heading_font != span["size"]:
                                    continue
                                heading = span["text"].strip()
                                if "References" in heading:  # ignore everything after the references
                                    self.section_names = subheadings
                                    self.section_texts = section_dict
                                    return
                                subheadings.append(heading)
                                if last_heading is not None:
                                    section_dict[last_heading] = section_dict[last_heading].strip()
                                section_dict[heading] = ""
                                last_heading = heading
                            # Otherwise append the span to the text of the previous subheading
                            elif last_heading is not None:
                                section_dict[last_heading] += " " + span["text"].strip()
        self.section_names = subheadings
        self.section_texts = section_dict


def main():
    path = r'demo.pdf'
    paper = Paper(path=path)
    paper.parse_pdf()
    # for key, value in paper.section_texts.items():
    #     print(key, value)
    #     print("*" * 40)


if __name__ == '__main__':
    main()
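As a quick usage sketch (the PDF path is a placeholder, as in the main() demo above), the parsed result can be consumed like this:

# Hypothetical example of consuming a parsed Paper; 'demo.pdf' is a placeholder path.
from get_paper_from_pdf import Paper

paper = Paper(path='demo.pdf')  # parses the PDF on construction when no title is given
print("Title:", paper.title)
print("Sections:", paper.section_names)
print("Abstract preview:", paper.section_texts.get("Abstract", "")[:200])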
requirements.txt
ADDED
@@ -0,0 +1,8 @@
PyMuPDF==1.21.1
tiktoken==0.2.0
tenacity==8.2.2
pybase64==1.2.3
Pillow==9.4.0
openai==0.27.0
markdown
gradio==3.20.1