Spaces:
Runtime error
Runtime error
Commit
•
ad8d0c1
0
Parent(s):
Duplicate from thomas-yanxin/LangChain-ChatLLM
Browse filesCo-authored-by: thomas Yan <[email protected]>
- .gitattributes +34 -0
- README.md +14 -0
- app.py +272 -0
- chatllm.py +159 -0
- chinese_text_splitter.py +24 -0
- nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
- nltk_data/tokenizers/punkt/.DS_Store +0 -0
- nltk_data/tokenizers/punkt/PY3/README +98 -0
- nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
- nltk_data/tokenizers/punkt/README +98 -0
- nltk_data/tokenizers/punkt/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/english.pickle +3 -0
- nltk_data/tokenizers/punkt/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/french.pickle +3 -0
- nltk_data/tokenizers/punkt/german.pickle +3 -0
- nltk_data/tokenizers/punkt/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/turkish.pickle +3 -0
- requirements.txt +19 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: LangChain ChatLLM
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.27.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
duplicated_from: thomas-yanxin/LangChain-ChatLLM
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import nltk
|
5 |
+
import sentence_transformers
|
6 |
+
import torch
|
7 |
+
from duckduckgo_search import ddg
|
8 |
+
from duckduckgo_search.utils import SESSION
|
9 |
+
from langchain.chains import RetrievalQA
|
10 |
+
from langchain.document_loaders import UnstructuredFileLoader
|
11 |
+
from langchain.embeddings import JinaEmbeddings
|
12 |
+
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
13 |
+
from langchain.prompts import PromptTemplate
|
14 |
+
from langchain.prompts.prompt import PromptTemplate
|
15 |
+
from langchain.vectorstores import FAISS
|
16 |
+
|
17 |
+
from chatllm import ChatLLM
|
18 |
+
from chinese_text_splitter import ChineseTextSplitter
|
19 |
+
|
20 |
+
nltk.data.path.append('./nltk_data')
|
21 |
+
|
22 |
+
# Display name shown in the UI -> embedding model identifier.
embedding_model_dict = {
    "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
    "ernie-base": "nghuyong/ernie-3.0-base-zh",
    "text2vec-base": "GanymedeNil/text2vec-base-chinese",
    "ViT-B-32": 'ViT-B-32::laion2b-s34b-b79k',
}

# Display name shown in the UI -> LLM identifier ("Minimax" is a remote API).
llm_model_dict = {
    "ChatGLM-6B-int8": "THUDM/chatglm-6b-int8",
    "ChatGLM-6B-int4": "THUDM/chatglm-6b-int4",
    "ChatGLM-6b-int4-qe": "THUDM/chatglm-6b-int4-qe",
    "Minimax": "Minimax",
}

# Pick the torch device once at import time: CUDA first, then Apple MPS,
# otherwise plain CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
|
38 |
+
|
39 |
+
|
40 |
+
def search_web(query):
    """Search DuckDuckGo for *query* and return the concatenated snippets.

    The shared ``duckduckgo_search`` session is pointed at a SOCKS5 proxy on
    ``localhost:7890`` before the search is issued.

    Args:
        query: Search string forwarded to ``ddg``.

    Returns:
        The ``body`` text of every result joined into a single string, or
        ``''`` when the search yields nothing.
    """
    proxy_url = "socks5h://localhost:7890"
    SESSION.proxies = {"http": proxy_url, "https": proxy_url}

    results = ddg(query)
    if not results:
        return ''
    # Join once instead of growing a string with ``+=``; a result without a
    # usable ``body`` contributes nothing instead of raising.
    return ''.join(result.get('body') or '' for result in results)
|
52 |
+
|
53 |
+
|
54 |
+
def load_file(filepath):
    """Load *filepath* and split it into documents with ChineseTextSplitter.

    Paths ending in ``.pdf`` (case-insensitive) use the loader's default mode
    and a splitter created with ``pdf=True``; every other path is loaded with
    ``mode="elements"`` and a splitter created with ``pdf=False``.

    Args:
        filepath: Path of the file to load.

    Returns:
        The documents returned by ``loader.load_and_split``.
    """
    is_pdf = filepath.lower().endswith(".pdf")
    if is_pdf:
        loader = UnstructuredFileLoader(filepath)
    else:
        loader = UnstructuredFileLoader(filepath, mode="elements")
    splitter = ChineseTextSplitter(pdf=is_pdf)
    return loader.load_and_split(text_splitter=splitter)
|
64 |
+
|
65 |
+
|
66 |
+
def init_knowledge_vector_store(embedding_model, filepath):
    """Build a FAISS vector store from the file at *filepath*.

    Args:
        embedding_model: Key of ``embedding_model_dict``. ``"ViT-B-32"`` is
            embedded through ``JinaEmbeddings`` (token read from the
            ``jina_auth_token`` environment variable); any other key uses
            ``HuggingFaceEmbeddings`` with a SentenceTransformer client
            placed on ``DEVICE``.
        filepath: Path of the knowledge file, passed to ``load_file``.

    Returns:
        The store created by ``FAISS.from_documents``.
    """
    use_jina = embedding_model == "ViT-B-32"
    if use_jina:
        token = os.getenv('jina_auth_token')
        embeddings = JinaEmbeddings(
            jina_auth_token=token,
            model_name=embedding_model_dict[embedding_model],
        )
    else:
        embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model_dict[embedding_model])
        # Swap in a client that lives on the device selected at import time.
        embeddings.client = sentence_transformers.SentenceTransformer(
            embeddings.model_name, device=DEVICE)

    documents = load_file(filepath)
    return FAISS.from_documents(documents, embeddings)
|
82 |
+
|
83 |
+
|
84 |
+
def get_knowledge_based_answer(query,
                               large_language_model,
                               vector_store,
                               VECTOR_SEARCH_TOP_K,
                               web_content,
                               history_len,
                               temperature,
                               top_p,
                               chat_history=None):
    """Answer *query* with a RetrievalQA chain over *vector_store*.

    Args:
        query: The user's question.
        large_language_model: Key of ``llm_model_dict``; ``"Minimax"`` selects
            the remote backend, anything else is loaded with
            ``ChatLLM.load_model``.
        vector_store: Store whose ``as_retriever`` supplies the context.
        VECTOR_SEARCH_TOP_K: ``k`` passed to the retriever.
        web_content: Optional web-search text embedded in the prompt; when
            falsy the prompt without a web section is used.
        history_len: Number of trailing history turns handed to the LLM;
            ``0`` or less means no history.
        temperature: Assigned to ``ChatLLM.temperature``.
        top_p: Assigned to ``ChatLLM.top_p``.
        chat_history: Previous turns; ``None`` means an empty history.

    Returns:
        The mapping returned by the chain, with source documents enabled.
    """
    # A ``[]`` default would be shared between calls; create it per call.
    if chat_history is None:
        chat_history = []

    if web_content:
        # The web text becomes part of the template source, so literal braces
        # must be doubled or PromptTemplate would read them as variables.
        escaped_web_content = web_content.replace("{", "{{").replace("}", "}}")
        prompt_template = f"""基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
已知网络检索内容:{escaped_web_content}""" + """
已知内容:
{context}
问题:
{question}"""
    else:
        prompt_template = """基于以下已知信息,请简洁并专业地回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。不允许在答案中添加编造成分。另外,答案请使用中文。

已知内容:
{context}

问题:
{question}"""
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=["context", "question"])

    chatLLM = ChatLLM()
    chatLLM.history = chat_history[-history_len:] if history_len > 0 else []
    if large_language_model == "Minimax":
        chatLLM.model = 'Minimax'
    else:
        chatLLM.load_model(
            model_name_or_path=llm_model_dict[large_language_model])
    chatLLM.temperature = temperature
    chatLLM.top_p = top_p

    knowledge_chain = RetrievalQA.from_llm(
        llm=chatLLM,
        retriever=vector_store.as_retriever(
            search_kwargs={"k": VECTOR_SEARCH_TOP_K}),
        prompt=prompt)
    # Feed each retrieved document into the prompt as its raw page content.
    knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
        input_variables=["page_content"], template="{page_content}")
    knowledge_chain.return_source_documents = True

    result = knowledge_chain({"query": query})
    return result
|
134 |
+
|
135 |
+
|
136 |
+
def clear_session():
    """Return the reset values for the chatbot display and the stored state."""
    cleared_chat, cleared_state = '', None
    return cleared_chat, cleared_state
|
138 |
+
|
139 |
+
|
140 |
+
def predict(input,
            large_language_model,
            embedding_model,
            file_obj,
            VECTOR_SEARCH_TOP_K,
            history_len,
            temperature,
            top_p,
            use_web,
            history=None):
    """Gradio callback: answer *input* against the uploaded knowledge file.

    Args:
        input: The user's question.
        large_language_model: Key of ``llm_model_dict``.
        embedding_model: Key of ``embedding_model_dict``.
        file_obj: Uploaded file object; its ``name`` is the path indexed.
        VECTOR_SEARCH_TOP_K: Number of documents to retrieve.
        history_len: Number of previous turns passed to the LLM.
        temperature: Sampling temperature.
        top_p: Nucleus sampling threshold.
        use_web: The string ``'True'`` enables a web search for extra context.
        history: List of ``(question, answer)`` turns; ``None`` starts fresh.

    Returns:
        ``('', history, history)``, matching the callback's three outputs.

    Raises:
        gr.Error: If no knowledge file has been uploaded.
    """
    if history is None:
        history = []
    if file_obj is None:
        # Without this guard the next line fails with an opaque
        # AttributeError on ``None.name``.
        raise gr.Error("请先上传知识库文件")

    print(file_obj.name)
    vector_store = init_knowledge_vector_store(embedding_model, file_obj.name)

    # The radio widget delivers the literal strings "True" / "False".
    web_content = search_web(query=input) if use_web == 'True' else ''

    resp = get_knowledge_based_answer(
        query=input,
        large_language_model=large_language_model,
        vector_store=vector_store,
        VECTOR_SEARCH_TOP_K=VECTOR_SEARCH_TOP_K,
        web_content=web_content,
        chat_history=history,
        history_len=history_len,
        temperature=temperature,
        top_p=top_p,
    )
    print(resp)
    history.append((input, resp['result']))
    return '', history, history
|
172 |
+
|
173 |
+
|
174 |
+
if __name__ == "__main__":
    # Build the Gradio UI: model, file and parameter controls in a narrow
    # left column, the chat window in a wide right column.
    # NOTE(review): the source's indentation was lost, so the widget nesting
    # below is reconstructed; confirm against the original layout.
    block = gr.Blocks()
    with block as demo:
        # Closing tags are emitted in nesting order (</font> before </center>).
        gr.Markdown("""<h1><center>LangChain-ChatLLM-Webui</center></h1>
<center><font size=3>
本项目基于LangChain和大型语言模型系列模型, 提供基于本地知识的自动问答应用. <br>
目前项目提供基于<a href='https://github.com/THUDM/ChatGLM-6B' target="_blank">ChatGLM-6B </a>系列、Minimax的LLM和包括text2vec-base-chinese、ernie-3.0-zh系列以及由<a href='https://cloud.jina.ai/user/inference' target="_blank">Jina</a>提供的ViT-B-32::laion2b-s34b-b79k等多个Embedding模型, 支持上传 txt、docx、md等文本格式文件. <br>
后续将提供更加多样化的LLM、Embedding和参数选项供用户尝试, 欢迎关注<a href='https://github.com/thomas-yanxin/LangChain-ChatGLM-Webui' target="_blank">Github地址</a>. <br>
本项目已内置开发者自己的key,用户无需输入自己的相关key. <br>
当然,更推荐您点击右上角的<strong>Duplicate this Space</strong>,将项目Fork到自己的Space中,保护个人隐私,且避免排队!
</font></center>
""")
        with gr.Row():
            with gr.Column(scale=1):
                model_choose = gr.Accordion("模型选择")
                with model_choose:
                    large_language_model = gr.Dropdown(
                        list(llm_model_dict.keys()),
                        label="large language model",
                        value="ChatGLM-6B-int4")

                    embedding_model = gr.Dropdown(
                        list(embedding_model_dict.keys()),
                        label="Embedding model",
                        value="text2vec-base")

                file = gr.File(label='请上传知识库文件, 目前支持txt、docx、md格式',
                               file_types=['.txt', '.md', '.docx'])

                use_web = gr.Radio(["True", "False"],
                                   label="Web Search",
                                   value="False")
                model_argument = gr.Accordion("模型参数配置")

                with model_argument:

                    VECTOR_SEARCH_TOP_K = gr.Slider(
                        1,
                        10,
                        value=6,
                        step=1,
                        label="vector search top k",
                        interactive=True)

                    HISTORY_LEN = gr.Slider(0,
                                            3,
                                            value=0,
                                            step=1,
                                            label="history len",
                                            interactive=True)

                    temperature = gr.Slider(0,
                                            1,
                                            value=0.01,
                                            step=0.01,
                                            label="temperature",
                                            interactive=True)
                    top_p = gr.Slider(0,
                                      1,
                                      value=0.9,
                                      step=0.1,
                                      label="top_p",
                                      interactive=True)

            with gr.Column(scale=4):
                chatbot = gr.Chatbot(label='ChatLLM').style(height=600)
                message = gr.Textbox(label='请输入问题')
                state = gr.State()

                with gr.Row():
                    clear_history = gr.Button("🧹 清除历史对话")
                    send = gr.Button("🚀 发送")

                # The send button and pressing Enter in the textbox trigger
                # the same callback, so they share one input/output wiring.
                predict_inputs = [
                    message, large_language_model,
                    embedding_model, file, VECTOR_SEARCH_TOP_K,
                    HISTORY_LEN, temperature, top_p, use_web,
                    state
                ]
                predict_outputs = [message, chatbot, state]

                send.click(predict,
                           inputs=predict_inputs,
                           outputs=predict_outputs)
                clear_history.click(fn=clear_session,
                                    inputs=[],
                                    outputs=[chatbot, state],
                                    queue=False)

                message.submit(predict,
                               inputs=predict_inputs,
                               outputs=predict_outputs)
        gr.Markdown("""提醒:<br>
1. 使用时请先上传自己的知识文件,并且文件中不含某些特殊字符,否则将返回error. <br>
2. 有任何使用问题,请通过[问题交流区](https://huggingface.co/spaces/thomas-yanxin/LangChain-ChatLLM/discussions)或[Github Issue区](https://github.com/thomas-yanxin/LangChain-ChatGLM-Webui/issues)进行反馈. <br>
""")
    demo.queue().launch(server_name='0.0.0.0', share=False)
|
chatllm.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
|
3 |
+
from typing import Dict, List, Optional, Tuple, Union
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from langchain.llms.base import LLM
|
7 |
+
from langchain.llms.utils import enforce_stop_tokens
|
8 |
+
from transformers import AutoModel, AutoTokenizer
|
9 |
+
|
10 |
+
# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Default device settings for the LLM; CUDA_DEVICE is "<DEVICE>:<DEVICE_ID>"
# when an id is configured and the bare device name otherwise.
DEVICE = "cuda"
DEVICE_ID = "0"
if DEVICE_ID:
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}"
else:
    CUDA_DEVICE = DEVICE
|
15 |
+
|
16 |
+
|
17 |
+
def torch_gc():
    """Release cached CUDA memory on ``CUDA_DEVICE``; no-op without CUDA."""
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
|
22 |
+
|
23 |
+
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    """Spread the model's 30 layer slots over *num_gpus* GPUs.

    Args:
        num_gpus: Number of GPUs to distribute over; must be at least 1.

    Returns:
        A mapping from module name to GPU index, suitable as a ``device_map``.

    Raises:
        ValueError: If *num_gpus* is smaller than 1.
    """
    if num_gpus < 1:
        # Previously this surfaced as a ZeroDivisionError (or a nonsensical
        # map for negative values) from the division below.
        raise ValueError(f"num_gpus must be >= 1, got {num_gpus}")

    # transformer.word_embeddings occupies 1 slot,
    # transformer.final_layernorm and lm_head together occupy 1 slot,
    # transformer.layers occupies 28 slots:
    # 30 slots in total are distributed over num_gpus cards.
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux the weight and input passed to torch.embedding can end
    # up on different devices, which raises a RuntimeError.
    # On Windows model.device is set to transformer.word_embeddings.device;
    # on Linux model.device is set to lm_head.device.
    # chat / stream_chat put input_ids on model.device, so if
    # transformer.word_embeddings.device differs from model.device the call
    # fails. Hence word_embeddings, final_layernorm and lm_head are all
    # pinned to the first card.
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    # The first card starts with the two slots taken by the pinned modules.
    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        # Internal invariant, not input validation.
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
class ChatLLM(LLM):
    """LangChain LLM backed by a local ChatGLM-style model or the Minimax API.

    When ``model`` is the string ``'Minimax'`` prompts are sent to the Minimax
    chat-completion endpoint; otherwise ``model`` and ``tokenizer`` are the
    objects set up by ``load_model`` and generation goes through
    ``model.chat``.
    """

    # Passed to ``model.chat`` as ``max_length``.
    max_token: int = 10000
    temperature: float = 0.1
    top_p = 0.9
    # Conversation turns as (user_text, reply) pairs.
    history = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatLLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        """Generate a reply for *prompt* and record it in ``self.history``.

        Args:
            prompt: Fully rendered prompt text.
            stop: Optional stop sequences, applied to the local model's reply.

        Returns:
            The generated reply text.
        """
        if self.model == 'Minimax':
            import requests

            group_id = os.getenv('group_id')
            api_key = os.getenv('api_key')

            url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }

            # Replay earlier turns, then append the new user message.
            messages = []
            for h_input, h_reply in self.history:
                messages.append({"sender_type": "USER", "text": h_input})
                messages.append({"sender_type": "BOT", "text": h_reply})
            messages.append({"sender_type": "USER", "text": prompt})

            request_body = {
                "model": "abab5-chat",
                "tokens_to_generate": 512,
                'messages': messages
            }
            resp = requests.post(url, headers=headers, json=request_body)
            # Fail on HTTP errors here instead of on a missing 'reply' key.
            resp.raise_for_status()
            response = resp.json()['reply']
            self.history.append((prompt, response))

        else:
            # NOTE(review): source indentation was lost; stop-token handling
            # and the history update are assumed to belong to this branch.
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                # top_p was configurable but never forwarded to the model.
                top_p=self.top_p,
                temperature=self.temperature,
            )
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        """Load the tokenizer and model and put the model in eval mode.

        Args:
            model_name_or_path: Model id or local path for ``from_pretrained``.
            llm_device: Requested device; a ``cuda`` request falls back to
                ``cpu`` when CUDA is not available.
            device_map: Optional per-module GPU placement for multi-GPU use;
                built with ``auto_configure_device_map`` when omitted.
            **kwargs: Extra arguments forwarded to ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Choose single- or multi-GPU deployment from the GPU count.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(
                    model_name_or_path, trust_remote_code=True,
                    **kwargs).half()
                # A caller-supplied device_map overrides the automatic layout.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)

                self.model = dispatch_model(model, device_map=device_map)
        else:
            # Reaching this branch with a "cuda" request means CUDA is not
            # available, so moving the model there would fail; use the CPU.
            target_device = llm_device
            if llm_device.lower().startswith("cuda"):
                target_device = "cpu"
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs)
                .float()
                .to(target_device)
            )
        self.model = self.model.eval()
|
chinese_text_splitter.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from langchain.text_splitter import CharacterTextSplitter
|
5 |
+
|
6 |
+
|
7 |
+
class ChineseTextSplitter(CharacterTextSplitter):
    """Text splitter that cuts text into sentences at terminal punctuation."""

    # Compiled once for the class. Matches a sentence terminator optionally
    # followed by up to two closing quotes, or the empty position before an
    # opening quote or at end of text.  (':' and ';' deliberately excluded.)
    _SENT_SEP_PATTERN = re.compile(
        '([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')

    def __init__(self, pdf: bool = False, **kwargs):
        """Create the splitter.

        Args:
            pdf: When true, whitespace is normalised before splitting.
            **kwargs: Forwarded to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf

    def split_text(self, text: str) -> List[str]:
        """Split *text* into sentences, keeping each terminator attached.

        Args:
            text: The text to split.

        Returns:
            The non-empty sentences in their original order.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            # Raw string: the former '\s' literal is an invalid escape.
            text = re.sub(r'\s', ' ', text)
            # No "\n\n" can remain after the line above, so the old
            # replace("\n\n", "") step was a no-op and is dropped.
        sent_sep_pattern = self._SENT_SEP_PATTERN
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                # A separator piece is glued onto the sentence it ends.
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list
|
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
|
3 |
+
size 6138625
|
nltk_data/tokenizers/punkt/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
nltk_data/tokenizers/punkt/PY3/README
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
2 |
+
|
3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
5 |
+
|
6 |
+
For information about how to use these models, please confer the tokenization HOWTO:
|
7 |
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
8 |
+
and chapter 3.8 of the NLTK book:
|
9 |
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
10 |
+
|
11 |
+
There are pretrained tokenizers for the following languages:
|
12 |
+
|
13 |
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
14 |
+
=======================================================================================================================================================================
|
15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
16 |
+
Literarni Noviny
|
17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
24 |
+
(American)
|
25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
29 |
+
Text Bank (Suomen Kielen newspapers
|
30 |
+
Tekstipankki)
|
31 |
+
Finnish Center for IT Science
|
32 |
+
(CSC)
|
33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
35 |
+
(European)
|
36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
38 |
+
(Switzerland) CD-ROM
|
39 |
+
(Uses "ss"
|
40 |
+
instead of "ß")
|
41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
47 |
+
(Bokmål and Information Technologies,
|
48 |
+
Nynorsk) Bergen
|
49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
51 |
+
(http://www.nkjp.pl/)
|
52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
54 |
+
(Brazilian) (Linguateca)
|
55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
57 |
+
Slovene Academy for Arts
|
58 |
+
and Sciences
|
59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
61 |
+
(European)
|
62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
64 |
+
(and some other texts)
|
65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
67 |
+
(Türkçe Derlem Projesi)
|
68 |
+
University of Ankara
|
69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
70 |
+
|
71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
72 |
+
Unicode using the codecs module.
|
73 |
+
|
74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
75 |
+
Computational Linguistics 32: 485-525.
|
76 |
+
|
77 |
+
---- Training Code ----
|
78 |
+
|
79 |
+
# import punkt
|
80 |
+
import nltk.tokenize.punkt
|
81 |
+
|
82 |
+
# Make a new Tokenizer
|
83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
84 |
+
|
85 |
+
# Read in training corpus (one example: Slovene)
|
86 |
+
import codecs
|
87 |
+
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
88 |
+
|
89 |
+
# Train tokenizer
|
90 |
+
tokenizer.train(text)
|
91 |
+
|
92 |
+
# Dump pickled tokenizer
|
93 |
+
import pickle
|
94 |
+
out = open("slovene.pickle","wb")
|
95 |
+
pickle.dump(tokenizer, out)
|
96 |
+
out.close()
|
97 |
+
|
98 |
+
---------
|
nltk_data/tokenizers/punkt/PY3/czech.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
|
3 |
+
size 1119050
|
nltk_data/tokenizers/punkt/PY3/danish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
|
3 |
+
size 1191710
|
nltk_data/tokenizers/punkt/PY3/dutch.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
|
3 |
+
size 693759
|
nltk_data/tokenizers/punkt/PY3/english.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
|
3 |
+
size 406697
|
nltk_data/tokenizers/punkt/PY3/estonian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
|
3 |
+
size 1499502
|
nltk_data/tokenizers/punkt/PY3/finnish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
|
3 |
+
size 1852226
|
nltk_data/tokenizers/punkt/PY3/french.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
|
3 |
+
size 553575
|
nltk_data/tokenizers/punkt/PY3/german.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
|
3 |
+
size 1463575
|
nltk_data/tokenizers/punkt/PY3/greek.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
|
3 |
+
size 876006
|
nltk_data/tokenizers/punkt/PY3/italian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
|
3 |
+
size 615089
|
nltk_data/tokenizers/punkt/PY3/malayalam.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
|
3 |
+
size 221207
|
nltk_data/tokenizers/punkt/PY3/norwegian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
|
3 |
+
size 1181271
|
nltk_data/tokenizers/punkt/PY3/polish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
|
3 |
+
size 1738386
|
nltk_data/tokenizers/punkt/PY3/portuguese.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
|
3 |
+
size 611919
|
nltk_data/tokenizers/punkt/PY3/russian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
|
3 |
+
size 33020
|
nltk_data/tokenizers/punkt/PY3/slovene.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
|
3 |
+
size 734444
|
nltk_data/tokenizers/punkt/PY3/spanish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
|
3 |
+
size 562337
|
nltk_data/tokenizers/punkt/PY3/swedish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
|
3 |
+
size 979681
|
nltk_data/tokenizers/punkt/PY3/turkish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
|
3 |
+
size 1017038
|
nltk_data/tokenizers/punkt/README
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
2 |
+
|
3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
5 |
+
|
6 |
+
For information about how to use these models, please refer to the tokenization HOWTO:
|
7 |
+
https://www.nltk.org/howto/tokenize.html
|
8 |
+
and chapter 3.8 of the NLTK book:
|
9 |
+
https://www.nltk.org/book/ch03.html#sec-segmentation
|
10 |
+
|
11 |
+
There are pretrained tokenizers for the following languages:
|
12 |
+
|
13 |
+
File Language Source Contents Size of training corpus (in tokens) Model contributed by
|
14 |
+
=======================================================================================================================================================================
|
15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
16 |
+
Literarni Noviny
|
17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
24 |
+
(American)
|
25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
29 |
+
Text Bank (Suomen Kielen newspapers
|
30 |
+
Tekstipankki)
|
31 |
+
Finnish Center for IT Science
|
32 |
+
(CSC)
|
33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
35 |
+
(European)
|
36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
38 |
+
(Switzerland) CD-ROM
|
39 |
+
(Uses "ss"
|
40 |
+
instead of "ß")
|
41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
47 |
+
(Bokmål and Information Technologies,
|
48 |
+
Nynorsk) Bergen
|
49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
51 |
+
(http://www.nkjp.pl/)
|
52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
54 |
+
(Brazilian) (Linguateca)
|
55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
57 |
+
Slovene Academy for Arts
|
58 |
+
and Sciences
|
59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
61 |
+
(European)
|
62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
64 |
+
(and some other texts)
|
65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
67 |
+
(Türkçe Derlem Projesi)
|
68 |
+
University of Ankara
|
69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
70 |
+
|
71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
72 |
+
Unicode using the codecs module.
|
73 |
+
|
74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
75 |
+
Computational Linguistics 32: 485-525.
|
76 |
+
|
77 |
+
---- Training Code ----
|
78 |
+
|
79 |
+
# import punkt
|
80 |
+
import nltk.tokenize.punkt
|
81 |
+
|
82 |
+
# Make a new Tokenizer
|
83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
84 |
+
|
85 |
+
# Read in training corpus (one example: Slovene)
|
86 |
+
import codecs
|
87 |
+
text = codecs.open("slovene.plain","r","iso-8859-2").read()
|
88 |
+
|
89 |
+
# Train tokenizer
|
90 |
+
tokenizer.train(text)
|
91 |
+
|
92 |
+
# Dump pickled tokenizer
|
93 |
+
import pickle
|
94 |
+
out = open("slovene.pickle","wb")
|
95 |
+
pickle.dump(tokenizer, out)
|
96 |
+
out.close()
|
97 |
+
|
98 |
+
---------
|
nltk_data/tokenizers/punkt/czech.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c085f6283bed0f1390d36a55d126ccc29c9b4dfcd2705e862b1711b7c6bb5ab
|
3 |
+
size 1424691
|
nltk_data/tokenizers/punkt/danish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df8366ad67db22b1f838cd63fcc589a6006faf66d7a46be5312d9c487ce2c811
|
3 |
+
size 1427491
|
nltk_data/tokenizers/punkt/dutch.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:12f46024d3c840529b56ac2a3118b80b8dc77705734bcdd71ff7c46f5808395e
|
3 |
+
size 839761
|
nltk_data/tokenizers/punkt/english.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e2d25d5adc3ee51ac192ce611bdc5378acae7136af5d3c52c2903c669f9aff0
|
3 |
+
size 495006
|
nltk_data/tokenizers/punkt/estonian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9083ef6ef3d5b9992a8a4ea09e889a87be75e2122ad25648307178960634cd8d
|
3 |
+
size 1803082
|
nltk_data/tokenizers/punkt/finnish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ce1b4dbe72e400e902220061457f9bd5f491ec37f7af468bc4694980c9623817
|
3 |
+
size 2192034
|
nltk_data/tokenizers/punkt/french.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e0be48e38a54232ea88c817cf34c1f1f8f44954e21f118c65af9f2d6a43cdbd
|
3 |
+
size 664010
|
nltk_data/tokenizers/punkt/german.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:251c2f4bde61ab3fc1cabc2158c62e6ab285fddd16267d2d3885f71e3ed61c7f
|
3 |
+
size 1708012
|
nltk_data/tokenizers/punkt/greek.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b3a6da093ed2df084ded6dc49c88f101d47a0c69398f19ae50af6785d93b1c5
|
3 |
+
size 2042362
|
nltk_data/tokenizers/punkt/italian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41e6aaf554e696703b3d41890973368b9b2f17c342745c07369742928d363731
|
3 |
+
size 748532
|
nltk_data/tokenizers/punkt/malayalam.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
|
3 |
+
size 221207
|
nltk_data/tokenizers/punkt/norwegian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45828b0d57da9a66f107ea277752f6c1cbde51b9f9feba173b2c6e2edb28af21
|
3 |
+
size 1422756
|
nltk_data/tokenizers/punkt/polish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79d09a9406f90dbf20f8cbb0a04a7aa0bdb4b71604eda31e97c3df2de5cd2837
|
3 |
+
size 2287622
|
nltk_data/tokenizers/punkt/portuguese.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c09561e770b6f17e3d85112f83007ff1397dec66c23acb15b9fe046eaefd2e86
|
3 |
+
size 739845
|
nltk_data/tokenizers/punkt/russian.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
|
3 |
+
size 33027
|
nltk_data/tokenizers/punkt/slovene.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2dc83b900e347c16ed0123868369107cd19d1a6125d099e26889580c4dbba277
|
3 |
+
size 939791
|
nltk_data/tokenizers/punkt/spanish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:61afae663cb2968148e0e27d5a3fcd4a5f19648688800caf8e7f998eaa75f4a7
|
3 |
+
size 680466
|
nltk_data/tokenizers/punkt/swedish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e5794208b223b2a54bd4ed565045172f9c6ef80b5bead94f71a5499455cda955
|
3 |
+
size 1168214
|
nltk_data/tokenizers/punkt/turkish.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2abb5d7ec4e80aeeb994407254a2e1a0928520727cc25f7bd3fc9ce0b5a78c1
|
3 |
+
size 1363199
|
requirements.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
langchain
|
3 |
+
accelerate
|
4 |
+
duckduckgo_search
|
5 |
+
transformers==4.27.1
|
6 |
+
unstructured[local-inference]
|
7 |
+
layoutparser[layoutmodels,tesseract]
|
8 |
+
nltk
|
9 |
+
sentence-transformers
|
10 |
+
beautifulsoup4
|
11 |
+
icetk
|
12 |
+
cpm_kernels
|
13 |
+
faiss-cpu
|
14 |
+
gradio
|
15 |
+
nltk
|
16 |
+
torch
|
17 |
+
torchvision
|
18 |
+
protobuf==3.19
|
19 |
+
jina
|