HoneyTian committed
Commit 5839f86
1 Parent(s): 845e414
main.py CHANGED
@@ -36,6 +36,11 @@ from toolbox.sementic_role_labeling.sementic_role_labeling import (
     engine_to_tagger as srl_engine_to_tagger,
     srl
 )
+from toolbox.tokenization.tokenization import (
+    language_to_engines as t_language_to_engines,
+    engine_to_tagger as t_engine_to_tagger,
+    tokenize
+)
 
 main_logger = logging.getLogger("main")
 
@@ -58,6 +63,11 @@ def get_args():
         default=(project_path / "srl_examples.json").as_posix(),
         type=str
     )
+    parser.add_argument(
+        "--token_example_json_file",
+        default=(project_path / "token_examples.json").as_posix(),
+        type=str
+    )
     args = parser.parse_args()
     return args
 
@@ -129,6 +139,25 @@ def run_srl(text: str, language: str, engine: str) -> str:
         return result
 
 
+def run_tokenization(text: str, language: str, engine: str) -> str:
+    try:
+        main_logger.info(f"tokenization started. text: {text}, language: {language}, engine: {engine}")
+
+        begin = time.time()
+
+        words = tokenize(text, language, engine)
+        result = ""
+        for word in words:
+            result += f"{word}\t"
+
+        time_cost = time.time() - begin
+        result += f"\n\ntime_cost: {round(time_cost, 4)}"
+        return result
+    except Exception as e:
+        result = f"{type(e)}\n{str(e)}"
+        return result
+
+
 def shell(cmd: str):
     return Command.popen(cmd)
 
@@ -142,6 +171,8 @@ def main():
         pos_examples: list = json.load(f)
     with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
         srl_examples: list = json.load(f)
+    with open(args.token_example_json_file, "r", encoding="utf-8") as f:
+        token_examples: list = json.load(f)
 
     # blocks
     with gr.Blocks() as blocks:
@@ -273,6 +304,47 @@ def main():
                     fn=run_ner,
                 )
 
+            with gr.TabItem("tokenization"):
+                def t_get_languages_by_engine(engine: str):
+                    language_list = list()
+                    for k, v in t_language_to_engines.items():
+                        if engine in v:
+                            language_list.append(k)
+                    return gr.Dropdown(choices=language_list, value=language_list[0], label="language")
+
+                t_language_choices = list(t_language_to_engines.keys())
+                t_engine_choices = list(t_engine_to_tagger.keys())
+
+                t_text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")
+
+                with gr.Row():
+                    t_language = gr.Dropdown(
+                        choices=t_language_choices, value=t_language_choices[0],
+                        label="language"
+                    )
+                    t_engine = gr.Dropdown(
+                        choices=t_engine_choices, value=t_engine_choices[0],
+                        label="engine"
+                    )
+                    t_engine.change(
+                        t_get_languages_by_engine,
+                        inputs=[t_engine],
+                        outputs=[t_language],
+                    )
+                t_output = gr.Textbox(lines=4, max_lines=50, label="output")
+                t_button = gr.Button(value="tokenize", variant="primary")
+                t_button.click(
+                    run_tokenization,
+                    inputs=[t_text, t_language, t_engine],
+                    outputs=[t_output],
+                )
+                gr.Examples(
+                    examples=token_examples,
+                    inputs=[t_text, t_language, t_engine],
+                    outputs=[t_output],
+                    fn=run_tokenization,
+                )
+
             with gr.TabItem("shell"):
                 shell_text = gr.Textbox(label="cmd")
                 shell_button = gr.Button("run")
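A note on the new tab's wiring: `t_engine.change` passes the selected engine to `t_get_languages_by_engine`, which returns a fresh `gr.Dropdown`; returning a component like this is how recent Gradio (4.x) replaces the choices of an existing dropdown. A minimal standalone sketch of the same pattern follows, where the "whitespace" engine and the "english" entry are illustrative and not part of this commit:

import gradio as gr

# Illustrative mapping; main.py uses t_language_to_engines from the toolbox instead.
language_to_engines = {"chinese": ["pyltp", "whitespace"], "english": ["whitespace"]}

def languages_for_engine(engine: str):
    # Return a replacement Dropdown; Gradio swaps it in for the declared output.
    langs = [lang for lang, engines in language_to_engines.items() if engine in engines]
    return gr.Dropdown(choices=langs, value=langs[0] if langs else None, label="language")

with gr.Blocks() as demo:
    engine = gr.Dropdown(choices=["pyltp", "whitespace"], value="pyltp", label="engine")
    language = gr.Dropdown(choices=["chinese"], value="chinese", label="language")
    engine.change(languages_for_engine, inputs=[engine], outputs=[language])

if __name__ == "__main__":
    demo.launch()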
token_examples.json ADDED
@@ -0,0 +1,3 @@
+[
+    ["元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"]
+]
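Each row of this file becomes one clickable entry in the tab's gr.Examples table, and its three fields fill the declared inputs [t_text, t_language, t_engine] in that order. A quick standalone sanity check of the format (a hypothetical snippet, not part of the commit):

import json

with open("token_examples.json", "r", encoding="utf-8") as f:
    token_examples = json.load(f)

for text, language, engine in token_examples:
    # Field order must match the gr.Examples inputs: [t_text, t_language, t_engine].
    print(f"{language}/{engine}: {text}")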
toolbox/tokenization/__init__.py ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == "__main__":
+    pass
toolbox/tokenization/pyltp_tokenization.py ADDED
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from functools import lru_cache
+import os
+from typing import List
+
+from pyltp import Segmentor
+
+ltp_data_dir = os.environ.get("LTP_DATA_DIR")
+
+
+@lru_cache(maxsize=5)
+def get_pyltp_tokenizer():
+    global ltp_data_dir
+
+    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
+
+    segmentor = Segmentor(cws_model_path)
+
+    return segmentor
+
+
+def pyltp_tokenize(text: str, language: str) -> List[str]:
+    segmentor = get_pyltp_tokenizer()
+
+    words = segmentor.segment(text)
+
+    return list(words)
+
+
+if __name__ == "__main__":
+    pass
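Two things worth noting here. First, `Segmentor(cws_model_path)` is the constructor-style API, which appears to match pyltp 0.4.x; older releases loaded models via `Segmentor()` followed by `.load(path)`, so the pyltp version should be pinned accordingly. Second, `lru_cache(maxsize=5)` on a zero-argument factory effectively caches one `Segmentor` instance, and `os.environ.get` returns None when `LTP_DATA_DIR` is unset, which would only surface later as a confusing `os.path.join` TypeError. A pre-flight check along these lines (a hypothetical helper, not part of the commit) would fail fast instead:

import os

def check_ltp_data_dir() -> str:
    # LTP_DATA_DIR should point at an unpacked LTP model directory
    # (e.g. the ltp_data_v3.4.0 release) containing cws.model.
    ltp_data_dir = os.environ.get("LTP_DATA_DIR")
    if ltp_data_dir is None:
        raise EnvironmentError("LTP_DATA_DIR is not set.")
    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    if not os.path.isfile(cws_model_path):
        raise FileNotFoundError(f"cws.model not found under {ltp_data_dir}.")
    return cws_model_path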
toolbox/tokenization/tokenization.py ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Callable, Dict, List
+
+from toolbox.tokenization.pyltp_tokenization import pyltp_tokenize
+
+
+language_to_engines = {
+    "chinese": ["pyltp"]
+}
+
+
+engine_to_tagger: Dict[str, Callable] = {
+    "pyltp": pyltp_tokenize
+}
+
+
+def tokenize(text: str, language: str, engine: str) -> List[str]:
+    tokenizer = engine_to_tagger.get(engine)
+    if tokenizer is None:
+        raise AssertionError(f"engine {engine} not supported.")
+
+    words = tokenizer(text, language)
+    return words
+
+
+if __name__ == "__main__":
+    pass
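Taken together, `tokenize` is the single entry point that main.py's `run_tokenization` calls: it dispatches through `engine_to_tagger` to the engine-specific function. A minimal usage sketch, assuming pyltp is installed and LTP_DATA_DIR is set as described above:

from toolbox.tokenization.tokenization import tokenize

# Dispatches to pyltp_tokenize; an unknown engine raises AssertionError.
words = tokenize("元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp")
print("\t".join(words))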