import subprocess

# Install spaCy at startup (a runtime workaround; on Hugging Face Spaces the
# cleaner fix is to pin spacy in requirements.txt).
subprocess.run(["pip", "install", "spacy"])

import spacy

# Load the English model, downloading it first if it is not present
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

import nltk
nltk.download('punkt', quiet=True)
from nltk.tokenize import word_tokenize
import jieba
from sacremoses import MosesTokenizer
from subword_nmt import apply_bpe
import codecs
# Chinese-side initialization: two jieba tokenizers (the second augmented
# with a user dictionary), a Moses tokenizer, and the Chinese BPE codes
jieba1 = jieba.Tokenizer()
jieba2 = jieba.Tokenizer()
jieba2.load_userdict('model2_data/dict.zh.txt')
mt_zh = MosesTokenizer(lang='zh')
with codecs.open('model2_data/bpecode.zh', 'r', 'utf-8') as f:
    bpe_zh_f = apply_bpe.BPE(f)
# English-side initialization: Moses tokenizer, BPE codes, etc.
mt_en = MosesTokenizer(lang='en')
with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
    bpe_en_f = apply_bpe.BPE(f)
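
# Note (an assumption based on the file names in model2_data/): bpecode.zh and
# bpecode.en appear to be subword-nmt merge tables, e.g. as produced by
# `subword-nmt learn-bpe`. apply_bpe.BPE.segment_tokens() splits each input
# token into the learned subword units, marking non-final pieces with "@@".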

def spacy_tokenize(line):
    # Run the text through spaCy
    doc = nlp(line)
    # Collect the surface form of each token
    words = [token.text for token in doc]
    # Join the tokens into one string, separated by single spaces
    return ' '.join(words)

def nltk_tokenize(line):
    # Tokenize with NLTK's word_tokenize
    tokens = word_tokenize(line)
    return tokens

def jieba_tokenize(line):
    # Segment Chinese text with jieba; strip() removes surrounding whitespace
    tokens = list(jieba1.cut(line.strip()))
    return tokens

def tokenize(line, mode):
    if mode == "汉译英":  # Chinese-to-English: segment with jieba
        return jieba_tokenize(line)
    else:  # otherwise treat the input as English
        return nltk_tokenize(spacy_tokenize(line))

def jieba_tokenize2(line):
    # Same as jieba_tokenize, but using the tokenizer with the user dictionary
    tokens = list(jieba2.cut(line.strip()))
    return tokens

def mt_bpe_zh(line):
    # Moses-tokenize the Chinese text, then apply the Chinese BPE codes
    zh_tok = mt_zh.tokenize(line)
    bpe_zh = bpe_zh_f.segment_tokens(zh_tok)
    print(bpe_zh)  # debug output
    return bpe_zh

def mt_bpe_en(line):
    # Moses-tokenize the English text, then apply the English BPE codes
    en_tok = mt_en.tokenize(line)
    bpe_en = bpe_en_f.segment_tokens(en_tok)
    print(bpe_en)  # debug output
    return bpe_en

def tokenize2(line, mode):
    if mode == "汉译英":  # Chinese-to-English: jieba first, then Moses + BPE
        return mt_bpe_zh(' '.join(jieba_tokenize2(line)))
    else:
        return mt_bpe_en(line)
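
# Minimal usage sketch. Assumptions: the script is run from the repo root so
# the model2_data/ paths above resolve, and "英译汉" stands in for any
# non-"汉译英" mode string, since only "汉译英" is special-cased above.
# The sample sentences are illustrative.
if __name__ == "__main__":
    print(tokenize("今天天气很好。", "汉译英"))
    print(tokenize("The weather is nice today.", "英译汉"))
    print(tokenize2("今天天气很好。", "汉译英"))
    print(tokenize2("The weather is nice today.", "英译汉"))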