# main.py — part_of_speech demo (Gradio app)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import asyncio
import logging
import json
import os
import platform
import time
from project_settings import project_path
# NLTK and pyltp model locations must be exported BEFORE the toolbox modules
# below are imported — presumably they read these variables at import time;
# TODO confirm. Do not reorder these imports.
os.environ["NLTK_DATA"] = (project_path / "data/nltk_data").as_posix()
os.environ["LTP_DATA_DIR"] = (project_path / "data/pyltp_models/ltp_data_v3.4.0").as_posix()
# NOTE(review): `project_path` is imported a second time here; harmless but redundant.
from project_settings import project_path, log_directory
import log
# Configure logging before any toolbox module creates its loggers.
log.setup(log_directory=log_directory)
import gradio as gr
from toolbox.os.command import Command
# Each toolbox task module exposes the same trio: a language->engines mapping,
# an engine->tagger mapping, and the task entry function.
from toolbox.named_entity_recognization.named_entity_recognization import (
language_to_engines as ner_language_to_engines,
engine_to_tagger as ner_engine_to_tagger,
ner
)
from toolbox.part_of_speech.part_of_speech import (
language_to_engines as pos_language_to_engines,
engine_to_tagger as pos_engine_to_tagger,
pos_tag
)
from toolbox.sementic_role_labeling.sementic_role_labeling import (
language_to_engines as srl_language_to_engines,
engine_to_tagger as srl_engine_to_tagger,
srl
)
from toolbox.tokenization.tokenization import (
language_to_engines as t_language_to_engines,
engine_to_tagger as t_engine_to_tagger,
tokenize
)
# Module-level logger shared by all run_* callbacks.
main_logger = logging.getLogger("main")
def get_args():
    """Parse command-line arguments.

    Each task tab has one option naming the JSON file that holds its
    example inputs; every default lives directly under ``project_path``.
    """
    parser = argparse.ArgumentParser()
    # The four options are identical except for the task prefix.
    for task in ("ner", "pos", "srl", "token"):
        parser.add_argument(
            f"--{task}_example_json_file",
            default=(project_path / f"{task}_examples.json").as_posix(),
            type=str,
        )
    return parser.parse_args()
def run_ner(text: str, language: str, engine: str) -> str:
    """Run named-entity recognition and format the result for the UI.

    :param text: input sentence.
    :param language: language key understood by the selected engine.
    :param engine: ner engine name.
    :return: tab-separated ``word/tag`` pairs followed by the elapsed time,
        or ``"{exception type}\\n{message}"`` if tagging fails.
    """
    try:
        main_logger.info(f"ner started. text: {text}, language: {language}, engine: {engine}")
        begin = time.time()
        words, postags, ner_tags = ner(text, language, engine)
        # join instead of repeated += ; trailing "\t" kept for output parity.
        result = "".join(f"{word}/{ner_tag}\t" for word, ner_tag in zip(words, ner_tags))
        time_cost = time.time() - begin
        result += f"\n\ntime_cost: {round(time_cost, 4)}"
        return result
    except Exception as e:
        # Log the traceback server-side; the UI still gets a readable string.
        main_logger.exception("ner failed")
        return f"{type(e)}\n{str(e)}"
def run_pos_tag(text: str, language: str, engine: str) -> str:
    """Run part-of-speech tagging and format the result for the UI.

    :param text: input sentence.
    :param language: language key understood by the selected engine.
    :param engine: pos engine name.
    :return: tab-separated ``word/postag`` pairs followed by the elapsed time,
        or ``"{exception type}\\n{message}"`` if tagging fails.
    """
    try:
        main_logger.info(f"pos tag started. text: {text}, language: {language}, engine: {engine}")
        begin = time.time()
        words, postags = pos_tag(text, language, engine)
        # join instead of repeated += ; trailing "\t" kept for output parity.
        result = "".join(f"{word}/{postag}\t" for word, postag in zip(words, postags))
        time_cost = time.time() - begin
        result += f"\n\ntime_cost: {round(time_cost, 4)}"
        return result
    except Exception as e:
        # Log the traceback server-side; the UI still gets a readable string.
        main_logger.exception("pos tag failed")
        return f"{type(e)}\n{str(e)}"
def run_srl(text: str, language: str, engine: str) -> str:
    """Run semantic role labeling and format the result for the UI.

    :param text: input sentence.
    :param language: language key understood by the selected engine.
    :param engine: srl engine name.
    :return: one line per predicate, each a tab-separated list of
        ``argument_text/role_name`` pairs, followed by the elapsed time;
        or ``"{exception type}\\n{message}"`` if labeling fails.
    """
    try:
        main_logger.info(f"srl started. text: {text}, language: {language}, engine: {engine}")
        begin = time.time()
        words, postags, arcs, roles = srl(text, language, engine)
        rows = []
        for role in roles:
            # Each r is (role_name, (start, end)) — the slice below treats
            # the indices as inclusive into `words`.
            row = "".join(
                f"{''.join(words[r[1][0]:r[1][1] + 1])}/{r[0]}\t"
                for r in role
            )
            rows.append(row)
        result = "".join(f"{row}\n" for row in rows)
        time_cost = time.time() - begin
        result += f"\n\ntime_cost: {round(time_cost, 4)}"
        return result
    except Exception as e:
        # Log the traceback server-side; the UI still gets a readable string.
        main_logger.exception("srl failed")
        return f"{type(e)}\n{str(e)}"
def run_tokenization(text: str, language: str, engine: str) -> str:
    """Run tokenization and format the result for the UI.

    :param text: input sentence.
    :param language: language key understood by the selected engine.
    :param engine: tokenizer engine name.
    :return: tab-separated tokens followed by the elapsed time,
        or ``"{exception type}\\n{message}"`` if tokenization fails.
    """
    try:
        main_logger.info(f"tokenization started. text: {text}, language: {language}, engine: {engine}")
        begin = time.time()
        words = tokenize(text, language, engine)
        # join instead of repeated += ; trailing "\t" kept for output parity.
        result = "".join(f"{word}\t" for word in words)
        time_cost = time.time() - begin
        result += f"\n\ntime_cost: {round(time_cost, 4)}"
        return result
    except Exception as e:
        # Log the traceback server-side; the UI still gets a readable string.
        main_logger.exception("tokenization failed")
        return f"{type(e)}\n{str(e)}"
def shell(cmd: str):
    """Execute *cmd* in a system shell and return its output.

    SECURITY NOTE(review): this is wired to the web UI's "shell" tab and runs
    arbitrary commands on the host. Do not expose this app publicly without
    removing the tab or adding authentication/sandboxing.
    """
    return Command.popen(cmd)
def main():
    """Parse args, load the example files, and launch the Gradio demo.

    Fixes relative to the previous revision:
      * the srl tab's engine-change callback now filters
        ``srl_language_to_engines`` (it previously consulted the pos mapping);
      * the srl/ner/tokenization buttons are labeled after their own task
        instead of the copy-pasted "pos_tag";
      * ``share=False if ... else False`` simplified to ``share=False``
        (both branches were identical).
    """
    args = get_args()

    with open(args.ner_example_json_file, "r", encoding="utf-8") as f:
        ner_examples: list = json.load(f)
    with open(args.pos_example_json_file, "r", encoding="utf-8") as f:
        pos_examples: list = json.load(f)
    with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
        srl_examples: list = json.load(f)
    with open(args.token_example_json_file, "r", encoding="utf-8") as f:
        token_examples: list = json.load(f)

    def _languages_updater(language_to_engines: dict):
        # Build a change-callback that narrows the language dropdown to the
        # languages supported by the newly selected engine.
        def on_engine_change(engine: str):
            language_list = [k for k, v in language_to_engines.items() if engine in v]
            return gr.Dropdown(choices=language_list, value=language_list[0], label="language")
        return on_engine_change

    def _build_tab(language_to_engines: dict, engine_to_tagger: dict,
                   fn, button_label: str, examples: list):
        # Shared widget layout used by every tagging tab: text box,
        # language/engine dropdowns, output box, run button, examples.
        language_choices = list(language_to_engines.keys())
        engine_choices = list(engine_to_tagger.keys())
        text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")
        with gr.Row():
            language = gr.Dropdown(
                choices=language_choices, value=language_choices[0],
                label="language"
            )
            engine = gr.Dropdown(
                choices=engine_choices, value=engine_choices[0],
                label="engine"
            )
        engine.change(
            _languages_updater(language_to_engines),
            inputs=[engine],
            outputs=[language],
        )
        output = gr.Textbox(lines=4, max_lines=50, label="output")
        button = gr.Button(value=button_label, variant="primary")
        button.click(
            fn,
            inputs=[text, language, engine],
            outputs=[output],
        )
        gr.Examples(
            examples=examples,
            inputs=[text, language, engine],
            outputs=[output],
            fn=fn,
        )

    # blocks
    with gr.Blocks() as blocks:
        gr.Markdown(value="## 词性标注.")
        with gr.Tabs():
            with gr.TabItem("part of speech"):
                _build_tab(pos_language_to_engines, pos_engine_to_tagger,
                           run_pos_tag, "pos_tag", pos_examples)
            with gr.TabItem("srl"):
                _build_tab(srl_language_to_engines, srl_engine_to_tagger,
                           run_srl, "srl", srl_examples)
            with gr.TabItem("ner"):
                _build_tab(ner_language_to_engines, ner_engine_to_tagger,
                           run_ner, "ner", ner_examples)
            with gr.TabItem("tokenization"):
                _build_tab(t_language_to_engines, t_engine_to_tagger,
                           run_tokenization, "tokenize", token_examples)
            with gr.TabItem("shell"):
                # NOTE(review): executes arbitrary shell commands from the UI;
                # keep this tab out of any publicly reachable deployment.
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
                shell_output = gr.Textbox(label="output")
                shell_button.click(shell, inputs=[shell_text, ], outputs=[shell_output])

    blocks.queue().launch(
        share=False,
        # bind localhost on Windows (dev box), all interfaces elsewhere (container).
        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        server_port=7860,
    )
    return
# Script entry point: parse args, load examples, and launch the Gradio UI.
if __name__ == "__main__":
    main()