HoneyTian committed
Commit 5839f86
1 Parent(s): 845e414
main.py CHANGED
@@ -36,6 +36,11 @@ from toolbox.sementic_role_labeling.sementic_role_labeling import (
     engine_to_tagger as srl_engine_to_tagger,
     srl
 )
+from toolbox.tokenization.tokenization import (
+    language_to_engines as t_language_to_engines,
+    engine_to_tagger as t_engine_to_tagger,
+    tokenize
+)
 
 main_logger = logging.getLogger("main")
 
@@ -58,6 +63,11 @@ def get_args():
         default=(project_path / "srl_examples.json").as_posix(),
         type=str
     )
+    parser.add_argument(
+        "--token_example_json_file",
+        default=(project_path / "token_examples.json").as_posix(),
+        type=str
+    )
     args = parser.parse_args()
     return args
 
@@ -129,6 +139,25 @@ def run_srl(text: str, language: str, engine: str) -> str:
         return result
 
 
+def run_tokenization(text: str, language: str, engine: str) -> str:
+    try:
+        main_logger.info(f"tokenization started. text: {text}, language: {language}, engine: {engine}")
+
+        begin = time.time()
+
+        words = tokenize(text, language, engine)
+        result = ""
+        for word in words:
+            result += f"{word}\t"
+
+        time_cost = time.time() - begin
+        result += f"\n\ntime_cost: {round(time_cost, 4)}"
+        return result
+    except Exception as e:
+        result = f"{type(e)}\n{str(e)}"
+        return result
+
+
 def shell(cmd: str):
     return Command.popen(cmd)
 
@@ -142,6 +171,8 @@ def main():
         pos_examples: list = json.load(f)
     with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
         srl_examples: list = json.load(f)
+    with open(args.token_example_json_file, "r", encoding="utf-8") as f:
+        token_examples: list = json.load(f)
 
     # blocks
     with gr.Blocks() as blocks:
@@ -273,6 +304,47 @@ def main():
                     fn=run_ner,
                 )
 
+            with gr.TabItem("tokenization"):
+                def t_get_languages_by_engine(engine: str):
+                    language_list = list()
+                    for k, v in t_language_to_engines.items():
+                        if engine in v:
+                            language_list.append(k)
+                    return gr.Dropdown(choices=language_list, value=language_list[0], label="language")
+
+                t_language_choices = list(t_language_to_engines.keys())
+                t_engine_choices = list(t_engine_to_tagger.keys())
+
+                t_text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")
+
+                with gr.Row():
+                    t_language = gr.Dropdown(
+                        choices=t_language_choices, value=t_language_choices[0],
+                        label="language"
+                    )
+                    t_engine = gr.Dropdown(
+                        choices=t_engine_choices, value=t_engine_choices[0],
+                        label="engine"
+                    )
+                    t_engine.change(
+                        t_get_languages_by_engine,
+                        inputs=[t_engine],
+                        outputs=[t_language],
+                    )
+                t_output = gr.Textbox(lines=4, max_lines=50, label="output")
+                t_button = gr.Button(value="tokenize", variant="primary")
+                t_button.click(
+                    run_tokenization,
+                    inputs=[t_text, t_language, t_engine],
+                    outputs=[t_output],
+                )
+                gr.Examples(
+                    examples=token_examples,
+                    inputs=[t_text, t_language, t_engine],
+                    outputs=[t_output],
+                    fn=run_tokenization,
+                )
+
             with gr.TabItem("shell"):
                 shell_text = gr.Textbox(label="cmd")
                 shell_button = gr.Button("run")
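A note on the new tab's wiring: `t_engine.change` passes the selected engine to `t_get_languages_by_engine`, which returns a fresh `gr.Dropdown`; returning a component like this is how recent Gradio (4.x) replaces the choices of an existing dropdown. A minimal standalone sketch of the same pattern follows, where the "whitespace" engine and the "english" entry are illustrative and not part of this commit:

import gradio as gr

# Illustrative mapping; main.py uses t_language_to_engines from the toolbox instead.
language_to_engines = {"chinese": ["pyltp", "whitespace"], "english": ["whitespace"]}

def languages_for_engine(engine: str):
    # Return a replacement Dropdown; Gradio swaps it in for the declared output.
    langs = [lang for lang, engines in language_to_engines.items() if engine in engines]
    return gr.Dropdown(choices=langs, value=langs[0] if langs else None, label="language")

with gr.Blocks() as demo:
    engine = gr.Dropdown(choices=["pyltp", "whitespace"], value="pyltp", label="engine")
    language = gr.Dropdown(choices=["chinese"], value="chinese", label="language")
    engine.change(languages_for_engine, inputs=[engine], outputs=[language])

if __name__ == "__main__":
    demo.launch()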
token_examples.json ADDED
@@ -0,0 +1,3 @@
+[
+    ["元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"]
+]
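Each row of this file becomes one clickable entry in the tab's gr.Examples table, and its three fields fill the declared inputs [t_text, t_language, t_engine] in that order. A quick standalone sanity check of the format (a hypothetical snippet, not part of the commit):

import json

with open("token_examples.json", "r", encoding="utf-8") as f:
    token_examples = json.load(f)

for text, language, engine in token_examples:
    # Field order must match the gr.Examples inputs: [t_text, t_language, t_engine].
    print(f"{language}/{engine}: {text}")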
toolbox/tokenization/__init__.py ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == "__main__":
+    pass
toolbox/tokenization/pyltp_tokenization.py ADDED
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from functools import lru_cache
+import os
+from typing import List
+
+from pyltp import Segmentor
+
+ltp_data_dir = os.environ.get("LTP_DATA_DIR")
+
+
+@lru_cache(maxsize=5)
+def get_pyltp_tokenizer():
+    global ltp_data_dir
+
+    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
+
+    segmentor = Segmentor(cws_model_path)
+
+    return segmentor
+
+
+def pyltp_tokenize(text: str, language: str) -> List[str]:
+    segmentor = get_pyltp_tokenizer()
+
+    words = segmentor.segment(text)
+
+    return list(words)
+
+
+if __name__ == "__main__":
+    pass
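Two things worth noting here. First, `Segmentor(cws_model_path)` is the constructor-style API, which appears to match pyltp 0.4.x; older releases loaded models via `Segmentor()` followed by `.load(path)`, so the pyltp version should be pinned accordingly. Second, `lru_cache(maxsize=5)` on a zero-argument factory effectively caches one `Segmentor` instance, and `os.environ.get` returns None when `LTP_DATA_DIR` is unset, which would only surface later as a confusing `os.path.join` TypeError. A pre-flight check along these lines (a hypothetical helper, not part of the commit) would fail fast instead:

import os

def check_ltp_data_dir() -> str:
    # LTP_DATA_DIR should point at an unpacked LTP model directory
    # (e.g. the ltp_data_v3.4.0 release) containing cws.model.
    ltp_data_dir = os.environ.get("LTP_DATA_DIR")
    if ltp_data_dir is None:
        raise EnvironmentError("LTP_DATA_DIR is not set.")
    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    if not os.path.isfile(cws_model_path):
        raise FileNotFoundError(f"cws.model not found under {ltp_data_dir}.")
    return cws_model_path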
toolbox/tokenization/tokenization.py ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Callable, Dict, List
+
+from toolbox.tokenization.pyltp_tokenization import pyltp_tokenize
+
+
+language_to_engines = {
+    "chinese": ["pyltp"]
+}
+
+
+engine_to_tagger: Dict[str, Callable] = {
+    "pyltp": pyltp_tokenize
+}
+
+
+def tokenize(text: str, language: str, engine: str) -> List[str]:
+    tokenizer = engine_to_tagger.get(engine)
+    if tokenizer is None:
+        raise AssertionError(f"engine {engine} not supported.")
+
+    words = tokenizer(text, language)
+    return words
+
+
+if __name__ == "__main__":
+    pass
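Taken together, `tokenize` is the single entry point that main.py's `run_tokenization` calls: it dispatches through `engine_to_tagger` to the engine-specific function. A minimal usage sketch, assuming pyltp is installed and LTP_DATA_DIR is set as described above:

from toolbox.tokenization.tokenization import tokenize

# Dispatches to pyltp_tokenize; an unknown engine raises AssertionError.
words = tokenize("元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp")
print("\t".join(words))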