Spaces:
Running
Running
加入 arxiv 小助手插件
Browse files- crazy_functions/下载arxiv论文翻译摘要.py +186 -0
- functional_crazy.py +33 -27
- main.py +1 -1
crazy_functions/下载arxiv论文翻译摘要.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from predict import predict_no_ui
|
2 |
+
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, get_conf
|
3 |
+
import re, requests, unicodedata, os
|
4 |
+
|
5 |
+
def download_arxiv_(url_pdf):
|
6 |
+
if 'arxiv.org' not in url_pdf:
|
7 |
+
if ('.' in url_pdf) and ('/' not in url_pdf):
|
8 |
+
new_url = 'https://arxiv.org/abs/'+url_pdf
|
9 |
+
print('下载编号:', url_pdf, '自动定位:', new_url)
|
10 |
+
# download_arxiv_(new_url)
|
11 |
+
return download_arxiv_(new_url)
|
12 |
+
else:
|
13 |
+
print('不能识别的URL!')
|
14 |
+
return None
|
15 |
+
if 'abs' in url_pdf:
|
16 |
+
url_pdf = url_pdf.replace('abs', 'pdf')
|
17 |
+
url_pdf = url_pdf + '.pdf'
|
18 |
+
|
19 |
+
url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')
|
20 |
+
title, other_info = get_name(_url_=url_abs)
|
21 |
+
|
22 |
+
paper_id = title.split()[0] # '[1712.00559]'
|
23 |
+
if '2' in other_info['year']:
|
24 |
+
title = other_info['year'] + ' ' + title
|
25 |
+
|
26 |
+
known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
|
27 |
+
for k in known_conf:
|
28 |
+
if k in other_info['comment']:
|
29 |
+
title = k + ' ' + title
|
30 |
+
|
31 |
+
download_dir = './gpt_log/arxiv/'
|
32 |
+
os.makedirs(download_dir, exist_ok=True)
|
33 |
+
|
34 |
+
title_str = title.replace('?', '?')\
|
35 |
+
.replace(':', ':')\
|
36 |
+
.replace('\"', '“')\
|
37 |
+
.replace('\n', '')\
|
38 |
+
.replace(' ', ' ')\
|
39 |
+
.replace(' ', ' ')
|
40 |
+
|
41 |
+
requests_pdf_url = url_pdf
|
42 |
+
file_path = download_dir+title_str
|
43 |
+
# if os.path.exists(file_path):
|
44 |
+
# print('返回缓存文件')
|
45 |
+
# return './gpt_log/arxiv/'+title_str
|
46 |
+
|
47 |
+
print('下载中')
|
48 |
+
proxies, = get_conf('proxies')
|
49 |
+
r = requests.get(requests_pdf_url, proxies=proxies)
|
50 |
+
with open(file_path, 'wb+') as f:
|
51 |
+
f.write(r.content)
|
52 |
+
print('下载完成')
|
53 |
+
|
54 |
+
# print('输出下载命令:','aria2c -o \"%s\" %s'%(title_str,url_pdf))
|
55 |
+
# subprocess.call('aria2c --all-proxy=\"172.18.116.150:11084\" -o \"%s\" %s'%(download_dir+title_str,url_pdf), shell=True)
|
56 |
+
|
57 |
+
x = "%s %s %s.bib" % (paper_id, other_info['year'], other_info['authors'])
|
58 |
+
x = x.replace('?', '?')\
|
59 |
+
.replace(':', ':')\
|
60 |
+
.replace('\"', '“')\
|
61 |
+
.replace('\n', '')\
|
62 |
+
.replace(' ', ' ')\
|
63 |
+
.replace(' ', ' ')
|
64 |
+
return './gpt_log/arxiv/'+title_str, other_info
|
65 |
+
|
66 |
+
|
67 |
+
def get_name(_url_):
    """Fetch an arXiv abstract page and extract the title and metadata.

    Args:
        _url_: URL of the paper's abstract page.

    Returns:
        ``(file_name, other_details)`` where ``file_name`` is the text of
        the page's ``<title>`` with ``'.pdf'`` appended, and
        ``other_details`` is a dict with keys ``'year'``, ``'authors'`` and
        ``'comment'`` (each ``''`` when it could not be extracted) plus
        ``'abstract'`` (only present when it was found).

    Raises:
        requests.HTTPError: if the page request returns an error status.
        AttributeError: if the page has no ``<title>`` element.
    """
    from bs4 import BeautifulSoup
    print('正在获取文献名!')
    print(_url_)

    proxies, = get_conf('proxies')
    res = requests.get(_url_, proxies=proxies)
    # Do not try to scrape metadata out of an error page.
    res.raise_for_status()

    bs = BeautifulSoup(res.text, 'html.parser')
    other_details = {}

    # Year: first 4-digit number in the "dateline" element.
    try:
        dateline = bs.find_all(class_='dateline')[0].text
        other_details['year'] = re.search(r'(\d{4})', dateline).group(1)
    except (IndexError, AttributeError):
        other_details['year'] = ''
        print('年份获取失败')

    # Abstract: handled separately so that a missing abstract can no longer
    # discard a year that was parsed successfully.
    try:
        other_details['abstract'] = bs.find_all(class_='abstract mathjax')[0].text
    except IndexError:
        print('abstract获取失败')

    # Authors: everything after the "Authors:" label.
    try:
        authors = bs.find_all(class_='authors')[0].text
        other_details['authors'] = authors.split('Authors:')[1]
    except (IndexError, AttributeError):
        other_details['authors'] = ''
        print('authors获取失败')

    # Comment: the metadata-table row that mentions "Comments". Rows are read
    # whole because splitting the flattened text on single spaces can only
    # ever yield one word, which loses the comment itself.
    # NOTE(review): assumes the 'metatable' element contains <tr> rows -- confirm
    # against the live page; if it does not, the comment is left empty.
    try:
        metatable = bs.find_all(class_='metatable')[0]
        real_comment = None
        for row in metatable.find_all('tr'):
            row_text = row.get_text(' ', strip=True)
            if 'Comments' in row_text:
                real_comment = row_text
        other_details['comment'] = real_comment if real_comment is not None else ''
    except (IndexError, AttributeError):
        other_details['comment'] = ''
        print('comment获取失败')

    # Reuse the already-parsed document instead of parsing the HTML again.
    title_str = bs.find('title').contents[0]
    print('获取成功:', title_str)

    return title_str + '.pdf', other_details
|
131 |
+
|
132 |
+
|
133 |
+
|
134 |
+
@CatchException
def 下载arxiv论文并翻译摘要(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
    """Plugin: download an arXiv paper and ask the model to translate its abstract.

    This is a generator; after each step it yields ``(chatbot, history, msg)``.

    Args:
        txt: user input, passed to ``download_arxiv_`` (an arXiv URL or paper id).
        top_p, temperature: sampling parameters forwarded to the model request.
        chatbot: list of chat entries; progress and results are appended to it.
        history: incoming history; it is discarded and rebuilt by this plugin.
        systemPromptTxt: not used by this plugin.
        WEB_PORT: not used by this plugin.
    """
    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要,函数插件作者[binary-husky]。正在提取摘要并下载PDF文档……"
    import os
    import shutil

    # Basic info: what the plugin does and who wrote it.
    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
    yield chatbot, history, '正常'

    # Probe the optional dependencies; on failure tell the user how to install them.
    try:
        import pdfminer, bs4
    except ImportError:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
        yield chatbot, history, '正常'
        return

    # Clear the history so the request does not overflow the input.
    history = []

    # Extract the metadata and download the PDF. ``download_arxiv_`` returns
    # None for input it cannot recognise; that is treated like any other
    # download failure, explicitly rather than via a tuple-unpacking error.
    try:
        download_result = download_arxiv_(txt)
    except Exception as err:
        print('下载arxiv论文失败:', err)
        download_result = None
    if download_result is None:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = "下载pdf文件未成功")
        yield chatbot, history, '正常'
        return
    pdf_path, info = download_result

    # Ask the model to extract and translate the abstract.
    i_say = f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
    i_say_show_user = f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
    yield chatbot, history, '正常'
    msg = '正常'
    # ** gpt request ** (with a timeout countdown)
    gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[])
    chatbot[-1] = (i_say_show_user, gpt_say)
    history.extend([i_say_show_user, gpt_say])
    yield chatbot, history, msg

    # Write the results to file. The PDF is copied and the source removed
    # (instead of renamed) to reset the file's creation time.
    shutil.copyfile(pdf_path, f'./gpt_log/{os.path.basename(pdf_path)}')
    os.remove(pdf_path)
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res + "\n\nPDF文件也已经下载"))
    yield chatbot, history, msg
|
186 |
+
|
functional_crazy.py
CHANGED
@@ -1,13 +1,8 @@
|
|
1 |
from toolbox import HotReload # HotReload 的意思是热更新,修改函数插件后,不需要重启程序,代码直接生效
|
2 |
|
3 |
-
# UserVisibleLevel是过滤器参数。
|
4 |
-
# 由于UI界面空间有限,所以通过这种方式决定UI界面中显示哪些插件
|
5 |
-
# 默认函数插件 VisibleLevel 是 0
|
6 |
-
# 当 UserVisibleLevel >= 函数插件的 VisibleLevel 时,该函数插件才会被显示出来
|
7 |
-
UserVisibleLevel = 1
|
8 |
-
|
9 |
-
|
10 |
def get_crazy_functionals():
|
|
|
|
|
11 |
from crazy_functions.读文章写摘要 import 读文章写摘要
|
12 |
from crazy_functions.生成函数注释 import 批量生成函数注释
|
13 |
from crazy_functions.解析项目源代码 import 解析项目本身
|
@@ -52,33 +47,44 @@ def get_crazy_functionals():
|
|
52 |
"Function": HotReload(高阶功能模板函数)
|
53 |
},
|
54 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
from crazy_functions
|
60 |
-
from crazy_functions.总结word文档 import 总结word文档
|
61 |
function_plugins.update({
|
62 |
-
"
|
63 |
-
"Color": "stop",
|
64 |
-
"Function": HotReload(批量总结PDF文档) # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
|
65 |
-
},
|
66 |
-
"[仅供开发调试] 批量总结PDF文档pdfminer": {
|
67 |
"Color": "stop",
|
68 |
"AsButton": False, # 加入下拉菜单中
|
69 |
-
"Function": HotReload(
|
70 |
-
}
|
71 |
-
"[仅供开发调试] 批量总结Word文档": {
|
72 |
-
"Color": "stop",
|
73 |
-
"Function": HotReload(总结word文档)
|
74 |
-
},
|
75 |
})
|
|
|
|
|
|
|
76 |
|
77 |
-
# VisibleLevel=2 尚未充分测试的函数插件,放在这里
|
78 |
-
if UserVisibleLevel >= 2:
|
79 |
-
function_plugins.update({
|
80 |
-
})
|
81 |
|
|
|
82 |
return function_plugins
|
83 |
|
84 |
|
|
|
1 |
from toolbox import HotReload # HotReload 的意思是热更新,修改函数插件后,不需要重启程序,代码直接生效
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
def get_crazy_functionals():
|
4 |
+
###################### 第一组插件 ###########################
|
5 |
+
# [第一组插件]: 最早期编写的项目插件和一些demo
|
6 |
from crazy_functions.读文章写摘要 import 读文章写摘要
|
7 |
from crazy_functions.生成函数注释 import 批量生成函数注释
|
8 |
from crazy_functions.解析项目源代码 import 解析项目本身
|
|
|
47 |
"Function": HotReload(高阶功能模板函数)
|
48 |
},
|
49 |
}
|
50 |
+
###################### 第二组插件 ###########################
|
51 |
+
# [第二组插件]: 经过充分测试,但功能上距离达到完美状态还差一点点
|
52 |
+
from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
|
53 |
+
from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
|
54 |
+
from crazy_functions.总结word文档 import 总结word文档
|
55 |
+
function_plugins.update({
|
56 |
+
"[仅供开发调试] 批量总结PDF文档": {
|
57 |
+
"Color": "stop",
|
58 |
+
"Function": HotReload(批量总结PDF文档) # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
|
59 |
+
},
|
60 |
+
"[仅供开发调试] 批量总结PDF文档pdfminer": {
|
61 |
+
"Color": "stop",
|
62 |
+
"AsButton": False, # 加入下拉菜单中
|
63 |
+
"Function": HotReload(批量总结PDF文档pdfminer)
|
64 |
+
},
|
65 |
+
"[仅供开发调试] 批量总结Word文档": {
|
66 |
+
"Color": "stop",
|
67 |
+
"Function": HotReload(总结word文档)
|
68 |
+
},
|
69 |
+
})
|
70 |
|
71 |
+
###################### 第三组插件 ###########################
|
72 |
+
# [第三组插件]: 尚未充分测试的函数插件,放在这里
|
73 |
+
try:
|
74 |
+
from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要
|
|
|
75 |
function_plugins.update({
|
76 |
+
"下载arxiv论文并翻译摘要": {
|
|
|
|
|
|
|
|
|
77 |
"Color": "stop",
|
78 |
"AsButton": False, # 加入下拉菜单中
|
79 |
+
"Function": HotReload(下载arxiv论文并翻译摘要)
|
80 |
+
}
|
|
|
|
|
|
|
|
|
81 |
})
|
82 |
+
except Exception as err:
|
83 |
+
print(f'[下载arxiv论文并翻译摘要] 插件导入失败 {str(err)}')
|
84 |
+
|
85 |
|
|
|
|
|
|
|
|
|
86 |
|
87 |
+
###################### 第n组插件 ###########################
|
88 |
return function_plugins
|
89 |
|
90 |
|
main.py
CHANGED
@@ -119,7 +119,7 @@ with gr.Blocks(theme=set_theme, analytics_enabled=False, css=advanced_css) as de
|
|
119 |
dropdown.select(on_dropdown_changed, [dropdown], [switchy_bt] )
|
120 |
# 随变按钮的回调函数注册
|
121 |
def route(k, *args, **kwargs):
|
122 |
-
if k in [r"打开插件列表", r"
|
123 |
yield from crazy_fns[k]["Function"](*args, **kwargs)
|
124 |
click_handle = switchy_bt.click(route,[switchy_bt, *input_combo, gr.State(PORT)], output_combo)
|
125 |
click_handle.then(on_report_generated, [file_upload, chatbot], [file_upload, chatbot])
|
|
|
119 |
dropdown.select(on_dropdown_changed, [dropdown], [switchy_bt] )
|
120 |
# 随变按钮的回调函数注册
|
121 |
def route(k, *args, **kwargs):
|
122 |
+
if k in [r"打开插件列表", r"请先从插件列表中选择"]: return
|
123 |
yield from crazy_fns[k]["Function"](*args, **kwargs)
|
124 |
click_handle = switchy_bt.click(route,[switchy_bt, *input_combo, gr.State(PORT)], output_combo)
|
125 |
click_handle.then(on_report_generated, [file_upload, chatbot], [file_upload, chatbot])
|