fukufuk committed
Commit 318adba
Parent(s): 0398d77

Update application file

- app.py +54 -22
- {util → lib}/__init__.py +3 -0
- lib/__pycache__/__init__.cpython-311.pyc +0 -0
- {util → lib}/__pycache__/fact_opinion_classifer.cpython-311.pyc +0 -0
- {util → lib}/__pycache__/fact_summarizer.cpython-311.pyc +0 -0
- lib/__pycache__/get_data_info.cpython-311.pyc +0 -0
- lib/__pycache__/get_raw_data_html.cpython-311.pyc +0 -0
- lib/__pycache__/get_raw_data_info.cpython-311.pyc +0 -0
- {util → lib}/__pycache__/opinion_reason_classifer.cpython-311.pyc +0 -0
- {util → lib}/fact_opinion_classifer.py +0 -0
- {util → lib}/fact_summarizer.py +0 -0
- lib/get_data_info.py +38 -0
- lib/get_raw_data_html.py +21 -0
- lib/get_raw_data_info.py +33 -0
- {util → lib}/opinion_reason_classifer.py +0 -0
- util/__pycache__/__init__.cpython-311.pyc +0 -0
app.py
CHANGED
@@ -6,8 +6,9 @@ import requests
 from bs4 import BeautifulSoup as bs
 from openai import OpenAI

-from …
-…
+from lib import (fact_opinion_classifer, fact_summarizer, get_data_info,
+                 get_raw_data_html, get_raw_data_info,
+                 opinion_reason_classifer)

 client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

@@ -17,46 +18,62 @@ def main(url):
     soup = bs(response.text, 'html.parser')
     article_elems = soup.select('#uamods > .article_body > div > p')
     sentence_text = "\n".join([article_elem.text for article_elem in article_elems])
-    …
-    sentences = [line.strip() for line in re.split('。|？|\n|！', message) if line.strip() != ""]
+    sentences = [line.strip() for line in re.split('。|？|\n|！', sentence_text) if line.strip() != ""]

     fact_opinion_list = fact_opinion_classifer(client, sentences)

     if len(sentences) != len(fact_opinion_list):
         raise ValueError(f'GPTの回答の数が一致しませんでした。: {fact_opinion_list}, {sentences}')
+
+    fact_main_message = '' + sentence_text
+    opinion_main_message = '' + sentence_text

     opinions = []
     facts = []
     for sentence, fact_opinion in zip(sentences, fact_opinion_list):
         if fact_opinion == "Opinion":
-            …
+            opinion_main_message = opinion_main_message.replace(sentence, f'<b>{sentence}</b>')
+            fact_main_message = fact_main_message.replace(sentence, f'<span style="color: gray;">{sentence}</span>')
             opinions.append(sentence)
         elif fact_opinion == "Fact":
+            fact_main_message = fact_main_message.replace(sentence, f'<b>{sentence}</b>')
+            opinion_main_message = opinion_main_message.replace(sentence, f'<span style="color: gray;">{sentence}</span>')
             facts.append(sentence)
         else:
             print(f'error: not known fact_opinion option: {fact_opinion}')

-    …
+    opinion_main_message = '<p>' + opinion_main_message.replace('\n', '</p>\n<p>') + '</p>'
+    fact_main_message = '<p>' + fact_main_message.replace('\n', '</p>\n<p>') + '</p>'

-    …
+    html_format = """<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 450px;"> {message}</div>"""
+    opinion_main_message = html_format.format(message=opinion_main_message)
+    fact_main_message = html_format.format(message=fact_main_message)

     if len(opinions) == 0:
         opinion_reason_text = "<h3>この文章には意見に関する文が見つかりませんでした。</h3>"
     else:
         opinion_reasons = opinion_reason_classifer(client, opinions)
+        all_opinion_data_html_format = "<ul>{all_data}</ul>"
+        opinion_data_html_format = """<li style="list-style: none; border-bottom:1px solid; border-top:1px solid; padding: 6px;"><div><b>{text}</b></div><div><details><summary><b>解釈</b></summary><span style="color: red;">{reason}</span></details></div></li>"""
         opinion_reason_text = ""
         for opinion, reason in zip(opinions, opinion_reasons):
-            opinion_reason_text += …
+            opinion_reason_text += opinion_data_html_format.format(text=opinion, reason=reason)
+        opinion_reason_html = all_opinion_data_html_format.format(all_data=opinion_reason_text)
+
+    fact_sentence = fact_summarizer(client, sentence_text)
+
+    data_texts = get_data_info(client, facts)
+    predicted_data = get_raw_data_info(client, data_texts)
+    data_main_message, raw_data_html_txt = get_raw_data_html(sentence_text, data_texts, predicted_data)

-    html_txt = f"""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 500px;"> {message}</div>"""
     opinion_reason_text = f"""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 450px;"> {opinion_reason_text}</div>"""
-    return …
+    return fact_main_message, opinion_main_message, opinion_reason_html, fact_sentence, raw_data_html_txt, data_main_message


 if __name__ == "__main__":
     '''main'''
     with gr.Blocks(
-        title='…
+        title='情報リテラシーサポートツール',
         theme='shivi/calm_seafoam',
         css='''
             #title_{
@@ -86,15 +103,28 @@ if __name__ == "__main__":

         btn = gr.Button("Enter")
         with gr.Row():
-            with gr.…
-            …
+            with gr.Tab('事実'):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        out_fact_0 = gr.HTML("""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 450px;"><span style="opacity: 0.5;">こちらに本文が表示されます。</span></div>""")
+                    with gr.Column(scale=1):
+                        gr.Markdown("## 本文から確実に言えること")
+                        out_fact_1 = gr.Markdown("""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 400px;"><span style="opacity: 0.5;">こちらに分析内容が表示されます。</span></div>""")
+            with gr.Tab('意見'):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        out_opinion_0 = gr.HTML("""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 450px;"><span style="opacity: 0.5;">こちらに本文が表示されます。</span></div>""")
+                    with gr.Column(scale=1):
+                        gr.Markdown("## 意見文分析")
+                        out_opinion_1 = gr.HTML("""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 400px;"><span style="opacity: 0.5;">こちらに分析内容が表示されます。</span></div>""")
+            with gr.Tab('データ'):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        out_data_0 = gr.HTML("""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 450px;"><span style="opacity: 0.5;">こちらに本文が表示されます。</span></div>""")
+                    with gr.Column(scale=1):
+                        gr.Markdown("## 加工データ推測")
+                        out_data_1 = gr.HTML("""<div style="padding: 10px; overflow-x: scroll; border: 1px #999999 solid; height: 400px;"><span style="opacity: 0.5;">こちらに分析内容が表示されます。</span></div>""")
+
         with gr.Row():
             with gr.Column():
                 gr.Markdown(value='''
@@ -134,6 +164,8 @@ if __name__ == "__main__":
                 </p>
                 ''')

-        inp.submit(main, [inp], […
-        …
+        inp.submit(main, [inp], [out_fact_0, out_opinion_0, out_opinion_1,
+                                 out_fact_1, out_data_1, out_data_0])
+        btn.click(main, [inp], [out_fact_0, out_opinion_0, out_opinion_1,
+                                out_fact_1, out_data_1, out_data_0])
     demo.launch(auth=(os.getenv('USER_NAME'), os.getenv('PASSWORD')))
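The new main() builds two parallel HTML views of the article: each sentence is wrapped in <b> in the view that matches its label and in a gray <span> in the other. Below is a minimal sketch of that step, not part of the commit; the sample text and labels are made-up stand-ins for what fact_opinion_classifer would return.

```python
import re

# Made-up article with one factual and one opinion sentence (illustrative only).
sample_text = "売上は前年より1割増えた。\nこれは素晴らしい成果だと思う。"
labels = ["Fact", "Opinion"]  # stand-in for fact_opinion_classifer(client, sentences)

# Same splitting rule as main(): break on 。？！ and newlines, drop empty pieces.
sentences = [line.strip() for line in re.split('。|？|\n|！', sample_text) if line.strip() != ""]

fact_view = sample_text      # facts in bold, opinions grayed out
opinion_view = sample_text   # opinions in bold, facts grayed out
for sentence, label in zip(sentences, labels):
    if label == "Opinion":
        opinion_view = opinion_view.replace(sentence, f'<b>{sentence}</b>')
        fact_view = fact_view.replace(sentence, f'<span style="color: gray;">{sentence}</span>')
    elif label == "Fact":
        fact_view = fact_view.replace(sentence, f'<b>{sentence}</b>')
        opinion_view = opinion_view.replace(sentence, f'<span style="color: gray;">{sentence}</span>')

print(fact_view)
print(opinion_view)
```

In the event wiring at the end of app.py, the six values returned by main() fill the output components positionally: out_fact_0, out_opinion_0, out_opinion_1, out_fact_1, out_data_1, out_data_0.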
{util → lib}/__init__.py
RENAMED
@@ -1,3 +1,6 @@
 from .fact_opinion_classifer import fact_opinion_classifer
 from .fact_summarizer import fact_summarizer
+from .get_data_info import get_data_info
+from .get_raw_data_html import get_raw_data_html
+from .get_raw_data_info import get_raw_data_info
 from .opinion_reason_classifer import opinion_reason_classifer
lib/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (539 Bytes)
{util → lib}/__pycache__/fact_opinion_classifer.cpython-311.pyc
RENAMED
Binary files a/util/__pycache__/fact_opinion_classifer.cpython-311.pyc and b/lib/__pycache__/fact_opinion_classifer.cpython-311.pyc differ
{util → lib}/__pycache__/fact_summarizer.cpython-311.pyc
RENAMED
File without changes
lib/__pycache__/get_data_info.cpython-311.pyc
ADDED
Binary file (3.71 kB)
lib/__pycache__/get_raw_data_html.cpython-311.pyc
ADDED
Binary file (2.09 kB)
lib/__pycache__/get_raw_data_info.cpython-311.pyc
ADDED
Binary file (2.89 kB)
{util → lib}/__pycache__/opinion_reason_classifer.cpython-311.pyc
RENAMED
Binary files a/util/__pycache__/opinion_reason_classifer.cpython-311.pyc and b/lib/__pycache__/opinion_reason_classifer.cpython-311.pyc differ
{util → lib}/fact_opinion_classifer.py
RENAMED
File without changes
{util → lib}/fact_summarizer.py
RENAMED
File without changes
lib/get_data_info.py
ADDED
@@ -0,0 +1,38 @@
+import re
+
+from openai import OpenAI
+
+REQUIREMENT = """以下の箇条書きの文章群から、数値データや表データを含む箇所をそのまま抜き出してください。数値や表が含まれていない場合、その文章については None としてください。具体的には以下の条件に従ってください。
+
+### 条件
+- 数値データや表データを含む箇所をそのまま抜き出す
+- 数値や表がない場合は None を返す
+- 結果を箇条書きで表示する
+- 複数データが含まれる場合はスラッシュ(/)区切りで一行に複数記す"""
+
+EXAMPLE_QUESTION = """1. TSRの倒産集計における業種分類は、総務省の日本産業標準分類に準拠している。
+2. 2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)で、2009年からの統計では同期間で最多だった2020年の31件を大幅に上回っている。
+3. 2009年以降の倒産動向をみると、ラーメン店は2013年に29件を記録した。その後、インバウンド(訪日外国人客)の来店増などで2016年は16件まで減少した"""
+
+EXAMPLE_ANSWER = """1. None
+2. 2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)/ 2009年からの統計では同期間で最多だった2020年の31件
+3. ラーメン店は2013年に29件を記録した/ 2016年は16件まで減少した"""
+
+
+def get_data_info(client: OpenAI, facts, model_name='gpt-4o-mini-2024-07-18'):
+    message = "\n".join([f'{i+1}. {q}' for i, q in enumerate(facts)])
+    response = client.chat.completions.create(
+        messages=[
+            {"role": "user", "content": REQUIREMENT},
+            {"role": "user", "content": EXAMPLE_QUESTION},
+            {"role": "assistant", "content": EXAMPLE_ANSWER},
+            {"role": "user", "content": message}
+        ],
+        model=model_name,
+        temperature=0,
+    )
+    res_text = response.choices[0].message.content
+    predicted_data = re.findall(r'\d{1,2}\. (.+)\n*', res_text)
+    predicted_data = [[d_.strip() for d_ in d.split('/')] for d in predicted_data]
+    predicted_data = '/'.join(['/'.join(d) for d in predicted_data if d != ['None']]).split('/')
+    return predicted_data
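For reference, a minimal sketch (not part of the commit) of the parsing step at the end of get_data_info, run on the file's own EXAMPLE_ANSWER instead of a live API reply: numbered lines are extracted, slash-separated items are split, the None entries are dropped, and everything is flattened into one list of data snippets.

```python
import re

# Stand-in for response.choices[0].message.content; this reuses EXAMPLE_ANSWER
# from get_data_info, so no API call is needed.
res_text = """1. None
2. 2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)/ 2009年からの統計では同期間で最多だった2020年の31件
3. ラーメン店は2013年に29件を記録した/ 2016年は16件まで減少した"""

predicted_data = re.findall(r'\d{1,2}\. (.+)\n*', res_text)                      # one string per numbered line
predicted_data = [[d_.strip() for d_ in d.split('/')] for d in predicted_data]   # split slash-separated items
predicted_data = '/'.join(['/'.join(d) for d in predicted_data if d != ['None']]).split('/')  # drop None, flatten
print(predicted_data)
# ['2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)', '2009年からの統計では同期間で最多だった2020年の31件',
#  'ラーメン店は2013年に29件を記録した', '2016年は16件まで減少した']
```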
lib/get_raw_data_html.py
ADDED
@@ -0,0 +1,21 @@
+def get_raw_data_html(sentence_text: str, data_texts: list, predicted_data: list):
+    data_main_message = f'{sentence_text}'
+    all_raw_data_html_format = "<h3>[生データ一覧]<h4><ul>{all_data}</ul>"
+    raw_data_html_format = """<li style="list-style: none; border-bottom:1px solid; border-top:1px solid; padding: 6px;"><b>{text}<b></li>"""
+    all_processed_data_html_format = "<h3>[加工データ一覧]<h4><ul>{all_data}</ul>"
+    processed_data_html_format = """<li style="list-style: none; border-bottom:1px solid; border-top:1px solid; padding: 6px;"><div><b>{text}</b></div><div><details><summary>加工内容</summary><span style="color: red;">{reason}</span></details></div></li>"""
+
+    all_raw_data = ""
+    all_processed_data = ""
+    for text, data in zip(data_texts, predicted_data):
+        if data[0] == '生データ':
+            all_raw_data += raw_data_html_format.format(text=text)
+            data_main_message = data_main_message.replace(text, f'<span style="background-color: #00ff00">{text}</span>')
+        elif data[0] == '加工データ':
+            all_processed_data += processed_data_html_format.format(text=text, reason=data[1])
+            data_main_message = data_main_message.replace(text, f'<span style="background-color: #ffff00">{text}</span>')
+        else:
+            pass
+    all_raw_data_html = all_raw_data_html_format.format(all_data=all_raw_data)
+    all_processed_data_html = all_processed_data_html_format.format(all_data=all_processed_data)
+    return data_main_message, all_raw_data_html + "\n<hr>\n" + all_processed_data_html
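A hedged usage sketch for get_raw_data_html. The inputs below are made-up stand-ins for what app.py passes in (data_texts from get_data_info, predicted_data from get_raw_data_info), and it assumes the script runs from the repository root so the lib package is importable.

```python
from lib import get_raw_data_html

# Made-up article text and analysis results, shaped like the real pipeline's output.
sentence_text = "2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)。ラーメン店は2013年に29件を記録した。"
data_texts = [
    "2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)",
    "ラーメン店は2013年に29件を記録した",
]
predicted_data = [
    ["加工データ", "前年との比較を割合で示しています。"],  # processed data with an explanation
    ["生データ"],                                          # raw data, no explanation needed
]

# Returns (1) the article body with data spans highlighted (yellow = processed, green = raw)
# and (2) an HTML listing of raw and processed data, the latter with a <details> explanation.
data_main_message, raw_data_html = get_raw_data_html(sentence_text, data_texts, predicted_data)
print(data_main_message)
print(raw_data_html)
```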
lib/get_raw_data_info.py
ADDED
@@ -0,0 +1,33 @@
+import re
+
+from openai import OpenAI
+
+REQUIREMENT_ = """以下の箇条書きのデータ群を分析し、「生データ」か「加工データ」かを推定してください。加工データである場合、スラッシュ(/)区切りで加工過程の説明も追記してください。
+生データ: 収集されたままの加工されていないデータ
+加工データ: 処理・変換され理解しやすくされたデータ"""
+
+EXAMPLE_QUESTION_ = """1. 2024年1-8月の倒産は、ラーメン店が44件(前年同期比57.1%増)
+2. ラーメン店は2013年に29件を記録した
+3. 2009年からの統計では同期間で最多だった2020年の31件"""
+
+EXAMPLE_ANSWER_ = """1. 加工データ / 前年との比較を割合で示すことで理解しやすくされています。
+2. 生データ
+3. 加工データ / 過去のデータと比べて数が最も多かったことを示すことで数値の理解を助けています。"""
+
+
+def get_raw_data_info(client: OpenAI, data_texts, model_name='gpt-4o-mini-2024-07-18'):
+    message = "\n".join([f'{i+1}. {q}' for i, q in enumerate(data_texts)])
+    response = client.chat.completions.create(
+        messages=[
+            {"role": "user", "content": REQUIREMENT_},
+            {"role": "user", "content": EXAMPLE_QUESTION_},
+            {"role": "assistant", "content": EXAMPLE_ANSWER_},
+            {"role": "user", "content": message}
+        ],
+        model=model_name,
+        temperature=0,
+    )
+    res_text = response.choices[0].message.content
+    predicted_data = re.findall(r'\d{1,2}\. (.+)\n*', res_text)
+    predicted_data = [[d_.strip() for d_ in d.split('/')] for d in predicted_data]
+    return predicted_data
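And the counterpart parsing sketch for get_raw_data_info (not part of the commit), again using the file's own EXAMPLE_ANSWER_ in place of a live reply: each numbered line becomes ['生データ'] or ['加工データ', explanation], which is exactly the data[0] / data[1] shape get_raw_data_html consumes.

```python
import re

# Stand-in for response.choices[0].message.content (EXAMPLE_ANSWER_ from this file).
res_text = """1. 加工データ / 前年との比較を割合で示すことで理解しやすくされています。
2. 生データ
3. 加工データ / 過去のデータと比べて数が最も多かったことを示すことで数値の理解を助けています。"""

predicted_data = re.findall(r'\d{1,2}\. (.+)\n*', res_text)                     # one string per numbered line
predicted_data = [[d_.strip() for d_ in d.split('/')] for d in predicted_data]  # split label / explanation
print(predicted_data)
# [['加工データ', '前年との比較を割合で示すことで理解しやすくされています。'],
#  ['生データ'],
#  ['加工データ', '過去のデータと比べて数が最も多かったことを示すことで数値の理解を助けています。']]
```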
{util → lib}/opinion_reason_classifer.py
RENAMED
File without changes
util/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (376 Bytes)