HoneyTian commited on
Commit
e778824
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ .git/
3
+ .idea/
4
+
5
+ /data/
6
+ /dotenv/
7
+ /logs/
8
+ /trained_models
9
+ /temp/
10
+
11
+ **/__pycache__/
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official CPython 3.10.
FROM python:3.10

# Create an unprivileged account with UID 1000.
# NOTE(review): no USER instruction follows, so the RUN/CMD steps below still run
# as the image's default user - confirm whether switching to "user" was intended.
RUN useradd -m -u 1000 user

WORKDIR /app

# requirements.txt is copied on its own, before the rest of the sources.
COPY --chown=user ./requirements.txt requirements.txt

RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app

# Run install.sh at build time for stages 0 through 2.
RUN bash install.sh --stage 0 --stop_stage 2

CMD ["python3", "main.py"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Part Of Speech
3
+ emoji: 😻
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
examples/tutorial_pyltp/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ## pyltp
2
+
3
+ ```text
4
+ 工程路径:
5
+ https://github.com/HuangFJ/pyltp
6
+
7
+ 模型文件:
8
+ https://ltp.ai/download.html
9
+
10
+ ```
examples/tutorial_pyltp/parser.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ import time
6
+
7
+ from project_settings import project_path
8
+
9
+ os.environ['NLTK_DATA'] = (project_path / "data/nltk_data").as_posix()
10
+
11
+ from nltk import DependencyGraph
12
+ from pyltp import Segmentor
13
+ from pyltp import Parser
14
+ from pyltp import Postagger
15
+ from pyltp import SementicRoleLabeller
16
+
17
+
18
def get_args():
    """
    Parse the command-line options of the dependency-parsing demo.

    Returns:
        argparse.Namespace with ``text`` (sentence to analyse) and
        ``ltp_data_dir`` (directory that holds the LTP model files).
    """
    default_ltp_data_dir = (project_path / "data/pyltp_models/ltp_data_v3.4.0").as_posix()

    arg_parser = argparse.ArgumentParser()
    # Other sample sentences that can be swapped in as the default:
    #   "集中竞价的方式回购公司股份"
    #   ",全中国都是我的"
    #   "可以啊可以"
    #   "我们是免费办理的, 不会收取任何手续费和服务费, 随借随还, 可以留个备用,您看可以吗?"
    #   "。啊不用不用挂断 你这个昨天来过电话 你哪有打不通"
    #   "利息怎么算"
    arg_parser.add_argument("--text", default="元芳你怎么看?我就趴窗口上看呗!", type=str)
    arg_parser.add_argument("--ltp_data_dir", default=default_ltp_data_dir, type=str)
    return arg_parser.parse_args()
39
+
40
+
41
def main():
    """
    Segment, POS-tag and dependency-parse ``--text``, then draw the dependency tree.

    The words, the POS tags, the elapsed time and the tab-separated tree
    description are printed before the tree window is opened.

    Reference:
    https://www.freesion.com/article/91401299576/
    """
    from contextlib import ExitStack

    args = get_args()

    cws_model_path = os.path.join(args.ltp_data_dir, 'cws.model')
    pos_model_path = os.path.join(args.ltp_data_dir, 'pos.model')
    parser_model_path = os.path.join(args.ltp_data_dir, 'parser.model')
    srl_model_path = os.path.join(args.ltp_data_dir, 'pisrl_win.model')

    # Each model is registered for release as soon as it is loaded, so every
    # loaded model is released even when a later step raises.
    with ExitStack() as stack:
        segmentor = Segmentor(cws_model_path)
        stack.callback(segmentor.release)
        pos_tagger = Postagger(pos_model_path)
        stack.callback(pos_tagger.release)
        parser = Parser(parser_model_path)
        stack.callback(parser.release)
        # The role labeller is loaded as before although its output is not used
        # below; it was previously never released.
        srl_labeler = SementicRoleLabeller(srl_model_path)
        stack.callback(srl_labeler.release)

        time_begin = time.time()

        words = segmentor.segment(args.text)
        print(list(words))

        postags = pos_tagger.postag(words)
        print(list(postags))

        arcs = parser.parse(words, postags)

        cost = time.time() - time_begin
        print("cost: {}".format(cost))

        rows = []
        for word, postag, arc in zip(words, postags, arcs):
            head = arc[0]
            # A head index of 0 marks the root of the sentence.
            relation = "ROOT" if head == 0 else arc[1]
            rows.append(
                "\t{word}({relation}/{postag})\t{postag}\t{head}\t{relation}\n".format(
                    word=word,
                    relation=relation,
                    postag=postag,
                    head=head,
                )
            )
        tree_str = "".join(rows)

        print(tree_str)
        conlltree = DependencyGraph(tree_str=tree_str)
        tree = conlltree.tree()
        tree.draw()
    return
99
+
100
+
101
+ if __name__ == '__main__':
102
+ main()
examples/tutorial_pyltp/part_of_speech.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://huggingface.co/LTP
5
+ """
6
+ import argparse
7
+ import os
8
+
9
+ from pyltp import Segmentor
10
+ from pyltp import Postagger
11
+
12
+ from project_settings import project_path
13
+
14
+
15
def get_args():
    """
    Parse the command-line options of the POS-tagging demo.

    Returns:
        argparse.Namespace with ``text`` and ``ltp_data_dir``.
    """
    default_ltp_data_dir = (project_path / "data/pyltp_models/ltp_data_v3.4.0").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text", default="元芳你怎么看?我就趴窗口上看呗!", type=str)
    arg_parser.add_argument("--ltp_data_dir", default=default_ltp_data_dir, type=str)
    return arg_parser.parse_args()
30
+
31
+
32
def main():
    """
    Segment ``--text`` into words, tag each word and print both sequences.

    Both pyltp models are released even when segmentation or tagging raises.
    """
    args = get_args()

    cws_model_path = os.path.join(args.ltp_data_dir, "cws.model")
    pos_model_path = os.path.join(args.ltp_data_dir, "pos.model")

    segmentor = Segmentor(cws_model_path)
    try:
        postagger = Postagger(pos_model_path)
        try:
            words = segmentor.segment(args.text)
            postags = postagger.postag(words)
            print(words)
            print(postags)
        finally:
            postagger.release()
    finally:
        segmentor.release()

    return
50
+
51
+
52
+ if __name__ == "__main__":
53
+ main()
examples/tutorial_pyltp/sentence_splitter.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ from pyltp import SentenceSplitter
6
+
7
+
8
def get_args():
    """
    Parse the command-line options of the sentence-splitting demo.

    Returns:
        argparse.Namespace with a single ``text`` attribute.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text", default="元芳你怎么看?我就趴窗口上看呗!", type=str)
    return arg_parser.parse_args()
18
+
19
+
20
def main():
    """Split ``--text`` into sentences with ``SentenceSplitter.split`` and print the result."""
    cli_args = get_args()
    print(SentenceSplitter.split(cli_args.text))
    return
26
+
27
+
28
+ if __name__ == "__main__":
29
+ main()
examples/tutorial_pyltp/tokenization.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://huggingface.co/LTP
5
+ """
6
+ import argparse
7
+ import os
8
+
9
+ from pyltp import Segmentor
10
+
11
+ from project_settings import project_path
12
+
13
+
14
def get_args():
    """
    Parse the command-line options of the word-segmentation demo.

    Returns:
        argparse.Namespace with ``text`` and ``ltp_data_dir``.
    """
    default_ltp_data_dir = (project_path / "data/pyltp_models/ltp_data_v3.4.0").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text", default="元芳你怎么看?我就趴窗口上看呗!", type=str)
    arg_parser.add_argument("--ltp_data_dir", default=default_ltp_data_dir, type=str)
    return arg_parser.parse_args()
29
+
30
+
31
def main():
    """
    Segment ``--text`` into words and print them.

    The segmentation model is released even when segmentation raises.
    """
    args = get_args()

    cws_model_path = os.path.join(args.ltp_data_dir, "cws.model")

    segmentor = Segmentor(cws_model_path)
    try:
        words = segmentor.segment(args.text)
        print(words)
    finally:
        segmentor.release()

    return
44
+
45
+
46
+ if __name__ == "__main__":
47
+ main()
install.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # bash install.sh --stage 0 --stop_stage 2
4
+ # bash install.sh --stage 1 --stop_stage 1
5
+
6
+ verbose=true;
7
+ stage=-1
8
+ stop_stage=0
9
+
10
+
11
+ # parse options
12
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # Reject options whose variable has not been set earlier in this script.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option consumes exactly one value. Without this check "shift 2"
      # fails on a trailing option and the loop never terminates.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires a value" 1>&2
        exit 1;
      fi

      # Current value of the variable named by $name (bash indirect expansion).
      # The previous form assigned a literal string, so was_bool was always false.
      old_value="${!name}";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done
39
+
40
+ work_dir="$(pwd)"
41
+ data_dir="$(pwd)/data"
42
+ nltk_data_dir="${data_dir}/nltk_data"
43
+ nltk_data_tokenizers_dir="${nltk_data_dir}/tokenizers"
44
+ pyltp_models_dir="${data_dir}/pyltp_models"
45
+
46
+ mkdir -p "${data_dir}"
47
+ mkdir -p "${nltk_data_dir}"
48
+ mkdir -p "${nltk_data_tokenizers_dir}"
49
+ mkdir -p "${pyltp_models_dir}"
50
+
51
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  $verbose && echo "stage 0: download nltk data punkt"
  cd "${nltk_data_tokenizers_dir}" || exit 1;

  # https://www.nltk.org/nltk_data/
  # punkt is the package for nltk==3.8.1, punkt_tab the one for nltk==3.8.2.
  # Each package is fetched only when its directory is not present yet.
  for package in punkt punkt_tab; do
    if [ ! -d "${package}" ]; then
      wget -c "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/${package}.zip"
      unzip "${package}.zip"
      rm "${package}.zip"
    fi
  done
fi
70
+
71
+
72
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  $verbose && echo "stage 1: download pyltp models"
  cd "${pyltp_models_dir}" || exit 1;

  # pyltp
  # https://ltp.ai/download.html
  # The Python code loads the models from ${pyltp_models_dir}/ltp_data_v3.4.0,
  # so that directory is what marks the download as done.
  # NOTE(review): assumes the archive unpacks into ltp_data_v3.4.0/ - confirm.
  if [ ! -d "ltp_data_v3.4.0" ]; then
    # Stop on a failed download or extraction instead of continuing without models.
    wget -c http://model.scir.yunfutech.com/model/ltp_data_v3.4.0.zip || exit 1;
    unzip ltp_data_v3.4.0.zip || exit 1;
    rm ltp_data_v3.4.0.zip
  fi

fi
log.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import logging
4
+ from logging.handlers import TimedRotatingFileHandler
5
+ import os
6
+
7
+
8
+ def setup(log_directory: str):
9
+ fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
10
+
11
+ stream_handler = logging.StreamHandler()
12
+ stream_handler.setLevel(logging.INFO)
13
+ stream_handler.setFormatter(logging.Formatter(fmt))
14
+
15
+ # main
16
+ main_logger = logging.getLogger("main")
17
+ main_logger.addHandler(stream_handler)
18
+ main_info_file_handler = TimedRotatingFileHandler(
19
+ filename=os.path.join(log_directory, "main.log"),
20
+ encoding="utf-8",
21
+ when="midnight",
22
+ interval=1,
23
+ backupCount=30
24
+ )
25
+ main_info_file_handler.setLevel(logging.INFO)
26
+ main_info_file_handler.setFormatter(logging.Formatter(fmt))
27
+ main_logger.addHandler(main_info_file_handler)
28
+
29
+ # http
30
+ http_logger = logging.getLogger("http")
31
+ http_file_handler = TimedRotatingFileHandler(
32
+ filename=os.path.join(log_directory, "http.log"),
33
+ encoding='utf-8',
34
+ when="midnight",
35
+ interval=1,
36
+ backupCount=30
37
+ )
38
+ http_file_handler.setLevel(logging.DEBUG)
39
+ http_file_handler.setFormatter(logging.Formatter(fmt))
40
+ http_logger.addHandler(http_file_handler)
41
+
42
+ # api
43
+ api_logger = logging.getLogger("api")
44
+ api_file_handler = TimedRotatingFileHandler(
45
+ filename=os.path.join(log_directory, "api.log"),
46
+ encoding='utf-8',
47
+ when="midnight",
48
+ interval=1,
49
+ backupCount=30
50
+ )
51
+ api_file_handler.setLevel(logging.DEBUG)
52
+ api_file_handler.setFormatter(logging.Formatter(fmt))
53
+ api_logger.addHandler(api_file_handler)
54
+
55
+ # alarm
56
+ alarm_logger = logging.getLogger("alarm")
57
+ alarm_file_handler = TimedRotatingFileHandler(
58
+ filename=os.path.join(log_directory, "alarm.log"),
59
+ encoding="utf-8",
60
+ when="midnight",
61
+ interval=1,
62
+ backupCount=30
63
+ )
64
+ alarm_file_handler.setLevel(logging.DEBUG)
65
+ alarm_file_handler.setFormatter(logging.Formatter(fmt))
66
+ alarm_logger.addHandler(alarm_file_handler)
67
+
68
+ debug_file_handler = TimedRotatingFileHandler(
69
+ filename=os.path.join(log_directory, "debug.log"),
70
+ encoding="utf-8",
71
+ when="D",
72
+ interval=1,
73
+ backupCount=7
74
+ )
75
+ debug_file_handler.setLevel(logging.DEBUG)
76
+ debug_file_handler.setFormatter(logging.Formatter(fmt))
77
+
78
+ info_file_handler = TimedRotatingFileHandler(
79
+ filename=os.path.join(log_directory, "info.log"),
80
+ encoding="utf-8",
81
+ when="D",
82
+ interval=1,
83
+ backupCount=7
84
+ )
85
+ info_file_handler.setLevel(logging.INFO)
86
+ info_file_handler.setFormatter(logging.Formatter(fmt))
87
+
88
+ error_file_handler = TimedRotatingFileHandler(
89
+ filename=os.path.join(log_directory, "error.log"),
90
+ encoding="utf-8",
91
+ when="D",
92
+ interval=1,
93
+ backupCount=7
94
+ )
95
+ error_file_handler.setLevel(logging.ERROR)
96
+ error_file_handler.setFormatter(logging.Formatter(fmt))
97
+
98
+ logging.basicConfig(
99
+ level=logging.DEBUG,
100
+ datefmt="%a, %d %b %Y %H:%M:%S",
101
+ handlers=[
102
+ debug_file_handler,
103
+ info_file_handler,
104
+ error_file_handler,
105
+ ]
106
+ )
107
+
108
+
109
+ if __name__ == "__main__":
110
+ pass
main.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import asyncio
5
+ import logging
6
+ import json
7
+ import os
8
+ import platform
9
+ import time
10
+
11
+ from project_settings import project_path
12
+
13
+ os.environ["NLTK_DATA"] = (project_path / "data/nltk_data").as_posix()
14
+ os.environ["LTP_DATA_DIR"] = (project_path / "data/pyltp_models/ltp_data_v3.4.0").as_posix()
15
+
16
+ from project_settings import project_path, log_directory
17
+ import log
18
+
19
+ log.setup(log_directory=log_directory)
20
+
21
+ import gradio as gr
22
+
23
+ from toolbox.os.command import Command
24
+ from toolbox.part_of_speech.part_of_speech import language_to_engines, engine_to_tagger, pos_parser
25
+
26
+ main_logger = logging.getLogger("main")
27
+
28
+
29
def get_args():
    """
    Parse the command-line options of the Gradio application.

    Returns:
        argparse.Namespace with ``pos_example_json_file``, the path of the JSON
        file holding the example rows shown in the UI.
    """
    default_example_file = (project_path / "pos_examples.json").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--pos_example_json_file", default=default_example_file, type=str)
    return arg_parser.parse_args()
39
+
40
+
41
def pos_tag(text: str, language: str, engine: str) -> str:
    """
    Tag ``text`` and format the result for the output textbox.

    Args:
        text: the text to tag.
        language: language selected in the UI; forwarded to ``pos_parser``.
        engine: name of the tagging engine selected in the UI.

    Returns:
        Tab-separated ``word/postag`` pairs followed by a ``time_cost`` line,
        or the exception type and message when tagging fails.
    """
    begin = time.time()

    try:
        # pos_parser takes (text, language, engine). The engine used to be passed
        # in the language position and the selected language was dropped.
        words, postags = pos_parser(text, language, engine)
        result = "".join(f"{word}/{postag}\t" for word, postag in zip(words, postags))

        time_cost = time.time() - begin
        result += f"\n\ntime_cost: {round(time_cost, 4)}"
        return result
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        result = f"{type(e)}\n{str(e)}"
        return result
57
+
58
+
59
def shell(cmd: str):
    """
    Run ``cmd`` through ``Command.popen`` and return whatever it returns.

    NOTE(review): this is wired to a textbox in the web UI, so anyone who can
    reach the page can execute arbitrary commands - confirm this is intended.
    """
    output = Command.popen(cmd)
    return output
61
+
62
+
63
def main():
    """
    Build the Gradio UI ("part of speech" tab and "shell" tab) and serve it on port 7860.

    The example rows are loaded from ``--pos_example_json_file``.
    """
    args = get_args()

    with open(args.pos_example_json_file, "r", encoding="utf-8") as f:
        pos_examples: list = json.load(f)

    def get_languages_by_engine(engine: str):
        """Return a language dropdown restricted to the languages ``engine`` supports."""
        language_list = [
            language
            for language, engines in language_to_engines.items()
            if engine in engines
        ]
        # An engine listed under no language used to raise IndexError here.
        value = language_list[0] if language_list else None
        return gr.Dropdown(choices=language_list, value=value, label="language")

    pos_language_choices = list(language_to_engines.keys())
    pos_engine_choices = list(engine_to_tagger.keys())

    # blocks
    with gr.Blocks() as blocks:
        gr.Markdown(value="## 词性标注.")

        with gr.Tabs():
            with gr.TabItem("part of speech"):
                pos_text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")

                with gr.Row():
                    pos_language = gr.Dropdown(
                        choices=pos_language_choices, value=pos_language_choices[0],
                        label="language"
                    )
                    pos_engine = gr.Dropdown(
                        choices=pos_engine_choices, value=pos_engine_choices[0],
                        label="engine"
                    )

                # Changing the engine refreshes the language dropdown.
                pos_engine.change(
                    get_languages_by_engine,
                    inputs=[pos_engine],
                    outputs=[pos_language],
                )
                pos_output = gr.Textbox(lines=4, max_lines=50, label="output")
                pos_button = gr.Button(value="pos_tag", variant="primary")
                pos_button.click(
                    pos_tag,
                    inputs=[pos_text, pos_language, pos_engine],
                    outputs=[pos_output],
                )

                gr.Examples(
                    examples=pos_examples,
                    inputs=[pos_text, pos_language, pos_engine],
                    outputs=[pos_output],
                    fn=pos_tag,
                )

            # NOTE(review): this tab executes whatever command is typed into it on
            # the server - confirm that exposing it is intended.
            with gr.TabItem("shell"):
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
                shell_output = gr.Textbox(label="output")
                shell_button.click(shell, inputs=[shell_text,], outputs=[shell_output])

    blocks.queue().launch(
        # Both branches of the former platform conditional were False.
        share=False,
        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        server_port=7860,
    )
    return
129
+
130
+
131
+ if __name__ == "__main__":
132
+ main()
pos_examples.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [
2
+ ["元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"]
3
+ ]
project_settings.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+ from pathlib import Path
5
+
6
+
7
# Absolute path of the directory that contains this settings module.
project_path = Path(os.path.abspath(os.path.dirname(__file__)))

# Directory for log files; created on import when it does not exist yet.
log_directory = project_path / "logs"
log_directory.mkdir(parents=True, exist_ok=True)
12
+
13
+
14
+ if __name__ == '__main__':
15
+ pass
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ nltk==3.8.1
3
+ pyltp==0.4.0
toolbox/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == "__main__":
6
+ pass
toolbox/json/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/json/misc.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Callable
4
+
5
+
6
def traverse(js, callback: Callable, *args, **kwargs):
    """
    Rebuild a nested structure, passing every ``int`` and ``str`` leaf through ``callback``.

    Lists, tuples and dicts are rebuilt recursively as new objects of the same
    kind; dict keys are traversed as well as values. Because ``bool`` is a
    subclass of ``int``, booleans also reach the callback. Any other value is
    returned unchanged.

    Args:
        js: the structure to walk.
        callback: called as ``callback(leaf, *args, **kwargs)``.

    Returns:
        The rebuilt structure.
    """
    if isinstance(js, list):
        return [traverse(item, callback, *args, **kwargs) for item in js]

    if isinstance(js, tuple):
        return tuple([traverse(item, callback, *args, **kwargs) for item in js])

    if isinstance(js, dict):
        mapped = dict()
        for key, value in js.items():
            new_key = traverse(key, callback, *args, **kwargs)
            new_value = traverse(value, callback, *args, **kwargs)
            mapped[new_key] = new_value
        return mapped

    if isinstance(js, (int, str)):
        return callback(js, *args, **kwargs)

    return js
32
+
33
+
34
def demo1():
    """
    Demonstrate ``traverse`` by stripping the leading ``$`` from string leaves.

    The sample configuration uses placeholder credentials and a loopback host.
    NOTE(review): the values committed here before looked like real secrets;
    if they were, rotate them.
    """
    d = {
        "env": "ppe",
        "mysql_connect": {
            "host": "$mysql_connect_host",
            "port": 3306,
            "user": "callbot",
            "password": "<mysql-password>",
            "database": "callbot_ppe",
            "charset": "utf8"
        },
        "es_connect": {
            "hosts": ["127.0.0.1"],
            "http_auth": ["elastic", "<es-password>"],
            "port": 9200
        }
    }

    def callback(s):
        if isinstance(s, str) and s.startswith('$'):
            return s[1:]
        return s

    result = traverse(d, callback=callback)
    print(result)
    return
60
+
61
+
62
+ if __name__ == '__main__':
63
+ demo1()
toolbox/os/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/os/command.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+
5
+
6
class Command(object):
    """
    Small helper for running shell commands from Python.

    Commands named in ``custom_command`` are dispatched to the class method of
    the same name; everything else is run with ``os.popen``.
    """
    # Commands handled in-process by a method of this class.
    custom_command = [
        "cd"
    ]

    @staticmethod
    def _get_cmd(command):
        """
        Split ``command`` into ``(cmd, args)`` at the first space.

        Returns:
            ``None`` for an empty or whitespace-only command, otherwise a tuple
            of the first word and the remainder of the line.
        """
        command = str(command).strip()
        if command == "":
            return None
        cmd, _, args = command.partition(" ")
        return cmd, args

    @classmethod
    def popen(cls, command):
        """
        Run ``command`` and return its standard output as a string.

        Returns:
            ``""`` for an empty command, the custom method's return value for a
            custom command, otherwise everything the command wrote to stdout.
        """
        parsed = cls._get_cmd(command)
        if parsed is None:
            # Nothing to run; unpacking None used to raise TypeError here.
            return ""

        cmd, args = parsed
        if cmd in cls.custom_command:
            method = getattr(cls, cmd)
            return method(args)

        # The context manager closes the pipe even when read() raises.
        with os.popen(command) as resp:
            return resp.read()

    @classmethod
    def cd(cls, args):
        """
        Change the working directory of this process to ``args``.

        A relative path is resolved against the current working directory.
        ``os.path.join`` keeps an absolute ``args`` as is, and surrounding
        whitespace left over from the command line is ignored.
        """
        target = os.path.join(os.getcwd(), args.strip())
        os.chdir(target)

    @classmethod
    def system(cls, command):
        """Run ``command`` with ``os.system`` and return its result."""
        return os.system(command)

    def __init__(self):
        pass
48
+
49
+
50
def ps_ef_grep(keyword: str):
    """
    Return the lines of ``ps -ef | grep <keyword>`` that mention ``keyword``.

    Lines containing the text ``grep`` are dropped.
    """
    output = Command.popen("ps -ef | grep {}".format(keyword))
    return [
        line
        for line in str(output).split("\n")
        if keyword in line and "grep" not in line
    ]
56
+
57
+
58
+ if __name__ == "__main__":
59
+ pass
toolbox/os/environment.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import json
4
+ import os
5
+
6
+ from dotenv import load_dotenv
7
+ from dotenv.main import DotEnv
8
+
9
+ from toolbox.json.misc import traverse
10
+
11
+
12
class EnvironmentManager(object):
    """
    Load ``<path>/<env>.env`` into the process environment and read typed values from it.

    Every value returned by ``get`` is also recorded in ``self._environ``.
    """
    def __init__(self, path, env, override=False):
        self.filename = os.path.join(path, '{}.env'.format(env))
        load_dotenv(dotenv_path=self.filename, override=override)
        self._environ = dict()

    def open_dotenv(self, filename: str = None):
        """Parse ``filename`` (default: the file given at construction) and return its dict."""
        parsed = DotEnv(
            dotenv_path=filename or self.filename,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        return parsed.dict()

    def get(self, key, default=None, dtype=str):
        """
        Read ``key`` from ``os.environ``.

        A value that is present is converted with ``dtype``; otherwise
        ``default`` is returned as given, without conversion.
        """
        raw = os.environ.get(key)
        value = default if raw is None else dtype(raw)
        self._environ[key] = value
        return value
48
+
49
+
50
+ _DEFAULT_DTYPE_MAP = {
51
+ 'int': int,
52
+ 'float': float,
53
+ 'str': str,
54
+ 'json.loads': json.loads
55
+ }
56
+
57
+
58
class JsonConfig(object):
    """
    Resolve ``$dtype:key`` placeholders inside JSON data from the environment.

    A string such as ``$float:threshold`` is replaced by the value of
    ``threshold`` looked up in the environment and converted with the
    ``float`` entry of ``dtype_map``.
    """
    def __init__(self, dtype_map: dict = None, environment: "EnvironmentManager" = None):
        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
        self.environment = environment or os.environ

    def sanitize_by_filename(self, filename: str):
        """Load the JSON file ``filename`` and resolve its placeholders."""
        with open(filename, 'r', encoding='utf-8') as f:
            js = json.load(f)

        return self.sanitize_by_json(js)

    def sanitize_by_json(self, js):
        """Resolve the placeholders of an already loaded JSON structure."""
        js = traverse(
            js,
            callback=self.sanitize,
            environment=self.environment
        )
        return js

    def sanitize(self, string, environment):
        """
        Resolve one leaf value; strings starting with ``$`` are placeholders.

        Args:
            string: the leaf value; anything that is not a ``$`` string is
                returned unchanged.
            environment: object with a ``get(key)`` method.

        Raises:
            ValueError: the placeholder has no ``:`` separator.
            KeyError: the dtype name is not in ``dtype_map``.
            AssertionError: the key is not present in ``environment``.
        """
        if not (isinstance(string, str) and string.startswith('$')):
            return string

        # Split at the first ':' only, so the key itself may contain ':'.
        dtype_name, separator, key = string[1:].partition(':')
        if not separator:
            raise ValueError(
                'invalid placeholder, expected "$dtype:key". value: {}'.format(string)
            )

        try:
            dtype = self.dtype_map[dtype_name]
        except KeyError:
            raise KeyError(
                'unsupported dtype: {}. value: {}'.format(dtype_name, string)
            ) from None

        value = environment.get(key)
        if value is None:
            raise AssertionError('environment not exist. key: {}'.format(key))

        return dtype(value)
96
+
97
+
98
def demo1():
    """
    Manual check: load the ``dev`` environment file and print ``init_scenes``.

    NOTE(review): the dotenv directory below is not among the files visible in
    this project - confirm the path before running.
    """
    from project_settings import project_path

    dotenv_dir = os.path.join(project_path, 'server/callbot_server/dotenv')
    environment = EnvironmentManager(path=dotenv_dir, env='dev')

    init_scenes = environment.get(key='init_scenes', dtype=json.loads)
    print(init_scenes)
    print(environment._environ)
    return
111
+
112
+
113
+ if __name__ == '__main__':
114
+ demo1()
toolbox/os/other.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import inspect
3
+
4
+
5
def pwd():
    """
    Return the absolute directory of the file from which this function is called.

    When no module can be determined for the calling frame, the file name
    recorded on the frame is used instead.
    """
    # context=0: the source lines around each frame are not needed.
    caller = inspect.stack(0)[1]
    module = inspect.getmodule(caller[0])
    # getmodule() may return None; this used to raise AttributeError.
    filename = getattr(module, "__file__", None) or caller.filename
    return os.path.dirname(os.path.abspath(filename))
toolbox/part_of_speech/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == "__main__":
6
+ pass
toolbox/part_of_speech/part_of_speech.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Callable, Dict, List, Tuple, Union
4
+
5
+ from toolbox.part_of_speech.pyltp_pos_tagger import pyltp_pos_tagger
6
+
7
+
8
+ language_to_engines = {
9
+ "chinese": ["pyltp"]
10
+ }
11
+
12
+
13
+ engine_to_tagger: Dict[str, Callable] = {
14
+ "pyltp": pyltp_pos_tagger
15
+ }
16
+
17
+
18
def pos_parser(text: str, language: str, engine: str = "pyltp") -> Tuple[List[str], List[str]]:
    """
    Tag ``text`` with the tagger registered for ``engine``.

    Args:
        text: the text to tag.
        language: forwarded unchanged to the tagger.
        engine: key into ``engine_to_tagger``.

    Returns:
        ``(words, postags)``, two sequences of equal length.

    Raises:
        AssertionError: ``engine`` is not registered, or the tagger returned
            sequences of different lengths.
    """
    pos_tagger = engine_to_tagger.get(engine)
    if pos_tagger is None:
        raise AssertionError(f"engine {engine} not supported.")

    words, postags = pos_tagger(text, language)
    if len(words) != len(postags):
        raise AssertionError(
            f"engine {engine} returned {len(words)} words but {len(postags)} postags."
        )
    return words, postags
27
+
28
+
29
+ if __name__ == "__main__":
30
+ pass
toolbox/part_of_speech/pyltp_pos_tagger.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
from functools import lru_cache
import os
from typing import List, Tuple, Union
6
+
7
+ ltp_data_dir = os.environ.get("LTP_DATA_DIR")
8
+
9
+ from pyltp import Postagger, Segmentor
10
+
11
+
12
@lru_cache(maxsize=5)
def get_pyltp_pos_tagger():
    """
    Load the pyltp segmentation and POS-tagging models once and cache them.

    The models ``cws.model`` and ``pos.model`` are read from the directory held
    in the module-level ``ltp_data_dir``.

    Returns:
        ``(segmentor, postagger)``.

    Raises:
        RuntimeError: ``ltp_data_dir`` is ``None``, i.e. ``LTP_DATA_DIR`` was
            not set when this module was imported.
    """
    if ltp_data_dir is None:
        # Previously this surfaced as an opaque TypeError from os.path.join.
        raise RuntimeError(
            "environment variable LTP_DATA_DIR must be set before this module is imported."
        )

    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    pos_model_path = os.path.join(ltp_data_dir, "pos.model")

    segmentor = Segmentor(cws_model_path)
    postagger = Postagger(pos_model_path)

    return segmentor, postagger
23
+
24
+
25
def pyltp_pos_tagger(text: str, language: str) -> Tuple[List[str], List[str]]:
    """
    Segment ``text`` into words and tag each word with pyltp.

    Args:
        text: the text to tag.
        language: accepted for signature compatibility with other taggers;
            not used by this function.

    Returns:
        ``(words, postags)`` as two lists.
    """
    segmentor, postagger = get_pyltp_pos_tagger()

    words = segmentor.segment(text)
    postags = postagger.postag(words)
    # NOTE(review): presumably some pyltp versions return vector-like objects;
    # list() makes the result match the annotation either way - confirm.
    return list(words), list(postags)
31
+
32
+
33
+ if __name__ == "__main__":
34
+ pass