import jieba
import langid
import nltk
import sudachipy

# Sentence-terminating tokens for the Chinese branch.  The full-width
# forms (！ ？) are what Chinese text normally uses; the ASCII forms from
# the original implementation are kept for backward compatibility.
_ZH_TERMINATORS = {"。", "！", "？", "!", "?", "……"}

nltk.download('punkt')
# Restrict langid so classify() can only ever return en / zh / ja.
langid.set_languages(['en', 'zh', 'ja'])


def split_text_into_sentences(text):
    """Split *text* into sentences, choosing a splitter by detected language.

    The language is detected once with langid (restricted above to
    en / zh / ja) and dispatched:

      * en -- NLTK's Punkt sentence tokenizer.
      * zh -- jieba word segmentation, splitting after sentence-final
        punctuation tokens.
      * ja -- SudachiPy morphological analysis, splitting after tokens
        tagged 補助記号/句点 (supplementary symbol / sentence period).

    Args:
        text: The input text to split.

    Returns:
        A list of sentence strings.

    Raises:
        RuntimeError: If langid reports a language outside the configured
            set (unreachable given the set_languages call above).
    """
    # Classify exactly once; the original re-ran langid.classify for each
    # elif branch, doing the detection work up to twice per call.
    lang = langid.classify(text)[0]

    if lang == "en":
        return nltk.tokenize.sent_tokenize(text)

    if lang == "zh":
        sentences = []
        segs = list(jieba.cut(text, cut_all=False))
        start = 0
        for i, seg in enumerate(segs):
            if seg in _ZH_TERMINATORS:
                # Include the terminator itself in the finished sentence.
                sentences.append("".join(segs[start:i + 1]))
                start = i + 1
        # Trailing text with no terminator still counts as a sentence.
        if start < len(segs):
            sentences.append("".join(segs[start:]))
        return sentences

    if lang == "ja":
        sentences = []
        tokenizer = sudachipy.Dictionary().create()
        current = []  # surface forms of the sentence being accumulated
        for token in tokenizer.tokenize(text):
            current.append(token.surface())
            pos = token.part_of_speech()
            # 補助記号/句点 == supplementary symbol / sentence-ending period.
            if pos[0] == "補助記号" and pos[1] == "句点":
                # join once per sentence instead of += per token (the
                # original's repeated concatenation is quadratic).
                sentences.append("".join(current))
                current = []
        if current:
            sentences.append("".join(current))
        return sentences

    raise RuntimeError("It is impossible to reach here.")