update
- main.py +72 -0
- token_examples.json +3 -0
- toolbox/tokenization/__init__.py +6 -0
- toolbox/tokenization/pyltp_tokenization.py +32 -0
- toolbox/tokenization/tokenization.py +28 -0
main.py
CHANGED
@@ -36,6 +36,11 @@ from toolbox.sementic_role_labeling.sementic_role_labeling import (
    engine_to_tagger as srl_engine_to_tagger,
    srl
)
+from toolbox.tokenization.tokenization import (
+    language_to_engines as t_language_to_engines,
+    engine_to_tagger as t_engine_to_tagger,
+    tokenize
+)

main_logger = logging.getLogger("main")

@@ -58,6 +63,11 @@ def get_args():
        default=(project_path / "srl_examples.json").as_posix(),
        type=str
    )
+    parser.add_argument(
+        "--token_example_json_file",
+        default=(project_path / "token_examples.json").as_posix(),
+        type=str
+    )
    args = parser.parse_args()
    return args

@@ -129,6 +139,25 @@ def run_srl(text: str, language: str, engine: str) -> str:
        return result


+def run_tokenization(text: str, language: str, engine: str) -> str:
+    try:
+        main_logger.info(f"tokenization started. text: {text}, language: {language}, engine: {engine}")
+
+        begin = time.time()
+
+        words = tokenize(text, language, engine)
+        result = ""
+        for word in words:
+            result += f"{word}\t"
+
+        time_cost = time.time() - begin
+        result += f"\n\ntime_cost: {round(time_cost, 4)}"
+        return result
+    except Exception as e:
+        result = f"{type(e)}\n{str(e)}"
+        return result
+
+
def shell(cmd: str):
    return Command.popen(cmd)

@@ -142,6 +171,8 @@ def main():
        pos_examples: list = json.load(f)
    with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
        srl_examples: list = json.load(f)
+    with open(args.token_example_json_file, "r", encoding="utf-8") as f:
+        token_examples: list = json.load(f)

    # blocks
    with gr.Blocks() as blocks:
@@ -273,6 +304,47 @@ def main():
                    fn=run_ner,
                )

+            with gr.TabItem("tokenization"):
+                def t_get_languages_by_engine(engine: str):
+                    language_list = list()
+                    for k, v in t_language_to_engines.items():
+                        if engine in v:
+                            language_list.append(k)
+                    return gr.Dropdown(choices=language_list, value=language_list[0], label="language")
+
+                t_language_choices = list(t_language_to_engines.keys())
+                t_engine_choices = list(t_engine_to_tagger.keys())
+
+                t_text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")
+
+                with gr.Row():
+                    t_language = gr.Dropdown(
+                        choices=t_language_choices, value=t_language_choices[0],
+                        label="language"
+                    )
+                    t_engine = gr.Dropdown(
+                        choices=t_engine_choices, value=t_engine_choices[0],
+                        label="engine"
+                    )
+                    t_engine.change(
+                        t_get_languages_by_engine,
+                        inputs=[t_engine],
+                        outputs=[t_language],
+                    )
+                t_output = gr.Textbox(lines=4, max_lines=50, label="output")
+                t_button = gr.Button(value="pos_tag", variant="primary")
+                t_button.click(
+                    run_tokenization,
+                    inputs=[t_text, t_language, t_engine],
+                    outputs=[t_output],
+                )
+                gr.Examples(
+                    examples=token_examples,
+                    inputs=[t_text, t_language, t_engine],
+                    outputs=[t_output],
+                    fn=run_tokenization,
+                )
+
            with gr.TabItem("shell"):
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
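The new run_tokenization handler mirrors run_srl: it times the call, joins the returned words with tabs, and reports exceptions as text instead of raising. A minimal sketch of exercising it outside the Gradio UI, assuming the pyltp model data is reachable via the LTP_DATA_DIR environment variable and that main.py imports cleanly on its own:

    # Hypothetical standalone check; LTP_DATA_DIR must be set before the import.
    import os
    os.environ.setdefault("LTP_DATA_DIR", "/path/to/ltp_data")  # assumed model location
    from main import run_tokenization

    print(run_tokenization("元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"))
    # expected shape: tab-separated tokens, a blank line, then "time_cost: <seconds>"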
token_examples.json
ADDED
@@ -0,0 +1,3 @@
+[
+    ["元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"]
+]
toolbox/tokenization/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == "__main__":
+    pass
toolbox/tokenization/pyltp_tokenization.py
ADDED
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from functools import lru_cache
+import os
+from typing import List
+
+ltp_data_dir = os.environ.get("LTP_DATA_DIR")
+
+from pyltp import Segmentor
+
+
+@lru_cache(maxsize=5)
+def get_pyltp_tokenizer():
+    global ltp_data_dir
+
+    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
+
+    segmentor = Segmentor(cws_model_path)
+
+    return segmentor
+
+
+def pyltp_tokenize(text: str, language: str) -> List[str]:
+    segmentor = get_pyltp_tokenizer()
+
+    words = segmentor.segment(text)
+
+    return words
+
+
+if __name__ == "__main__":
+    pass
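pyltp_tokenization.py reads LTP_DATA_DIR at import time and lazily builds a cached Segmentor from cws.model in that directory, so the variable has to be set before the module is imported. A minimal sketch of calling the segmenter directly, assuming a pyltp release whose Segmentor accepts the model path in its constructor (as the code above does) and an LTP data directory containing cws.model:

    # Hypothetical smoke test; the model path below is an assumption.
    import os
    os.environ["LTP_DATA_DIR"] = "/data/ltp_data_v3.4.0"  # set before the import below

    from toolbox.tokenization.pyltp_tokenization import pyltp_tokenize

    words = pyltp_tokenize("元芳你怎么看?", language="chinese")
    print(list(words))  # segment() yields an iterable of word strings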
toolbox/tokenization/tokenization.py
ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Callable, Dict, List
+
+from toolbox.tokenization.pyltp_tokenization import pyltp_tokenize
+
+
+language_to_engines = {
+    "chinese": ["pyltp"]
+}
+
+
+engine_to_tagger: Dict[str, Callable] = {
+    "pyltp": pyltp_tokenize
+}
+
+
+def tokenize(text: str, language: str, engine: str) -> List[str]:
+    tokenizer = engine_to_tagger.get(engine)
+    if tokenizer is None:
+        raise AssertionError(f"engine {engine} not supported.")
+
+    words = tokenizer(text, language)
+    return words
+
+
+if __name__ == "__main__":
+    pass
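tokenization.py mirrors the structure already used for SRL in this repo: language_to_engines feeds the UI dropdowns, engine_to_tagger maps an engine name to the callable that does the segmentation, and tokenize() looks the callable up and delegates to it. A minimal usage sketch, under the same LTP_DATA_DIR assumption as above:

    from toolbox.tokenization.tokenization import language_to_engines, tokenize

    # pick the first engine registered for Chinese ("pyltp") and dispatch
    engine = language_to_engines["chinese"][0]
    words = tokenize("学而时习之,不亦悦乎。", language="chinese", engine=engine)
    print("\t".join(words))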