ytjoh thomas-yanxin committed on
Commit
ad8d0c1
0 Parent(s):

Duplicate from thomas-yanxin/LangChain-ChatLLM

Browse files

Co-authored-by: thomas Yan <[email protected]>

Files changed (48) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +272 -0
  4. chatllm.py +159 -0
  5. chinese_text_splitter.py +24 -0
  6. nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
  7. nltk_data/tokenizers/punkt/.DS_Store +0 -0
  8. nltk_data/tokenizers/punkt/PY3/README +98 -0
  9. nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
  10. nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
  11. nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
  12. nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
  13. nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
  14. nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
  15. nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
  16. nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
  17. nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
  18. nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
  19. nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
  20. nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
  21. nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
  22. nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
  23. nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
  24. nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
  25. nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
  26. nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
  27. nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
  28. nltk_data/tokenizers/punkt/README +98 -0
  29. nltk_data/tokenizers/punkt/czech.pickle +3 -0
  30. nltk_data/tokenizers/punkt/danish.pickle +3 -0
  31. nltk_data/tokenizers/punkt/dutch.pickle +3 -0
  32. nltk_data/tokenizers/punkt/english.pickle +3 -0
  33. nltk_data/tokenizers/punkt/estonian.pickle +3 -0
  34. nltk_data/tokenizers/punkt/finnish.pickle +3 -0
  35. nltk_data/tokenizers/punkt/french.pickle +3 -0
  36. nltk_data/tokenizers/punkt/german.pickle +3 -0
  37. nltk_data/tokenizers/punkt/greek.pickle +3 -0
  38. nltk_data/tokenizers/punkt/italian.pickle +3 -0
  39. nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
  40. nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
  41. nltk_data/tokenizers/punkt/polish.pickle +3 -0
  42. nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
  43. nltk_data/tokenizers/punkt/russian.pickle +3 -0
  44. nltk_data/tokenizers/punkt/slovene.pickle +3 -0
  45. nltk_data/tokenizers/punkt/spanish.pickle +3 -0
  46. nltk_data/tokenizers/punkt/swedish.pickle +3 -0
  47. nltk_data/tokenizers/punkt/turkish.pickle +3 -0
  48. requirements.txt +19 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: LangChain ChatLLM
3
+ emoji: ⚡
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.27.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: thomas-yanxin/LangChain-ChatLLM
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import nltk
5
+ import sentence_transformers
6
+ import torch
7
+ from duckduckgo_search import ddg
8
+ from duckduckgo_search.utils import SESSION
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.document_loaders import UnstructuredFileLoader
11
+ from langchain.embeddings import JinaEmbeddings
12
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
13
+ from langchain.prompts import PromptTemplate
14
+ from langchain.prompts.prompt import PromptTemplate
15
+ from langchain.vectorstores import FAISS
16
+
17
+ from chatllm import ChatLLM
18
+ from chinese_text_splitter import ChineseTextSplitter
19
+
20
+ nltk.data.path.append('./nltk_data')
21
+
22
# UI display name -> embedding model identifier handed to the embedding class.
embedding_model_dict = {
    "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
    "ernie-base": "nghuyong/ernie-3.0-base-zh",
    "text2vec-base": "GanymedeNil/text2vec-base-chinese",
    "ViT-B-32": "ViT-B-32::laion2b-s34b-b79k",
}

# UI display name -> checkpoint passed to ChatLLM.load_model. "Minimax" is not
# loaded locally; it is handled as a special case by the callers.
llm_model_dict = {
    "ChatGLM-6B-int8": "THUDM/chatglm-6b-int8",
    "ChatGLM-6B-int4": "THUDM/chatglm-6b-int4",
    "ChatGLM-6b-int4-qe": "THUDM/chatglm-6b-int4-qe",
    "Minimax": "Minimax",
}

# Preferred torch device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
38
+
39
+
40
def search_web(query):
    """Search DuckDuckGo for ``query`` and return the concatenated snippets.

    Args:
        query: Search string passed straight to ``ddg``.

    Returns:
        The ``body`` text of every result joined with no separator, or an
        empty string when the search returns nothing.
    """
    # NOTE(review): the proxy address is hard-coded and re-assigned on the
    # shared duckduckgo_search session on every call — confirm a SOCKS5 proxy
    # is actually listening on localhost:7890 in the deployment environment.
    SESSION.proxies = {
        "http": "socks5h://localhost:7890",
        "https": "socks5h://localhost:7890",
    }
    results = ddg(query)
    if not results:
        return ''
    return ''.join(result['body'] for result in results)
52
+
53
+
54
def load_file(filepath):
    """Load a document from ``filepath`` and split it with ChineseTextSplitter.

    Files whose name ends in ``.pdf`` (case-insensitive) are read with the
    loader's default mode and split with ``pdf=True``; every other file is
    read with ``mode="elements"`` and split with ``pdf=False``.

    Returns:
        Whatever ``UnstructuredFileLoader.load_and_split`` returns for the
        chosen loader/splitter pair.
    """
    is_pdf = filepath.lower().endswith(".pdf")
    if is_pdf:
        loader = UnstructuredFileLoader(filepath)
    else:
        loader = UnstructuredFileLoader(filepath, mode="elements")
    splitter = ChineseTextSplitter(pdf=is_pdf)
    return loader.load_and_split(splitter)
64
+
65
+
66
def init_knowledge_vector_store(embedding_model, filepath):
    """Embed the document at ``filepath`` and index it in a FAISS store.

    Args:
        embedding_model: A key of ``embedding_model_dict``. ``"ViT-B-32"``
            selects ``JinaEmbeddings`` (token read from the
            ``jina_auth_token`` environment variable); any other key selects
            ``HuggingFaceEmbeddings``.
        filepath: Path of the knowledge file, forwarded to ``load_file``.

    Returns:
        The store built by ``FAISS.from_documents``.
    """
    model_id = embedding_model_dict[embedding_model]
    if embedding_model != "ViT-B-32":
        embeddings = HuggingFaceEmbeddings(model_name=model_id)
        # Replace the default client so the encoder runs on DEVICE.
        # NOTE(review): SOURCE lost its indentation; this assignment is taken
        # to belong to the HuggingFace branch only — confirm upstream.
        embeddings.client = sentence_transformers.SentenceTransformer(
            embeddings.model_name, device=DEVICE)
    else:
        embeddings = JinaEmbeddings(
            jina_auth_token=os.getenv('jina_auth_token'),
            model_name=model_id)

    documents = load_file(filepath)
    return FAISS.from_documents(documents, embeddings)
82
+
83
+
84
def get_knowledge_based_answer(query,
                               large_language_model,
                               vector_store,
                               VECTOR_SEARCH_TOP_K,
                               web_content,
                               history_len,
                               temperature,
                               top_p,
                               chat_history=None):
    """Answer ``query`` from the vector store (and optional web text).

    Args:
        query: The user's question.
        large_language_model: A key of ``llm_model_dict``; ``"Minimax"``
            selects the remote API instead of loading a local checkpoint.
        vector_store: Store providing ``as_retriever``.
        VECTOR_SEARCH_TOP_K: Number of documents the retriever returns.
        web_content: Web search text to embed in the prompt; falsy to omit.
        history_len: How many trailing history turns to keep (0 keeps none).
        temperature: Sampling temperature set on the LLM wrapper.
        top_p: Nucleus-sampling value set on the LLM wrapper.
        chat_history: Previous turns; defaults to an empty history.

    Returns:
        The chain result; source documents are requested alongside the answer.
    """
    # A literal ``[]`` default would be shared between calls.
    if chat_history is None:
        chat_history = []

    if web_content:
        # The web text is spliced into a format-style template, so any literal
        # brace in it must be doubled or PromptTemplate would treat it as a
        # placeholder (or fail to parse the template).
        safe_web_content = web_content.replace("{", "{{").replace("}", "}}")
        prompt_template = f"""基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
已知网络检索内容:{safe_web_content}""" + """
已知内容:
{context}
问题:
{question}"""
    else:
        prompt_template = """基于以下已知信息,请简洁并专业地回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。不允许在答案中添加编造成分。另外,答案请使用中文。

已知内容:
{context}

问题:
{question}"""
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=["context", "question"])

    chatLLM = ChatLLM()
    chatLLM.history = chat_history[-history_len:] if history_len > 0 else []
    if large_language_model == "Minimax":
        chatLLM.model = 'Minimax'
    else:
        chatLLM.load_model(
            model_name_or_path=llm_model_dict[large_language_model])
    chatLLM.temperature = temperature
    chatLLM.top_p = top_p

    knowledge_chain = RetrievalQA.from_llm(
        llm=chatLLM,
        retriever=vector_store.as_retriever(
            search_kwargs={"k": VECTOR_SEARCH_TOP_K}),
        prompt=prompt)
    # Feed each retrieved document into the prompt as its raw page content.
    knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
        input_variables=["page_content"], template="{page_content}")

    knowledge_chain.return_source_documents = True

    result = knowledge_chain({"query": query})
    return result
134
+
135
+
136
def clear_session():
    """Return the reset values for the chatbot display and the session state."""
    cleared_chatbot = ''
    cleared_state = None
    return cleared_chatbot, cleared_state
138
+
139
+
140
def predict(input,
            large_language_model,
            embedding_model,
            file_obj,
            VECTOR_SEARCH_TOP_K,
            history_len,
            temperature,
            top_p,
            use_web,
            history=None):
    """Handle one chat turn: index the uploaded file, then answer the question.

    Args:
        input: The user's question.
        large_language_model: Key of ``llm_model_dict`` chosen in the UI.
        embedding_model: Key of ``embedding_model_dict`` chosen in the UI.
        file_obj: Uploaded file object; its ``name`` attribute is used as the
            path of the knowledge file.
        VECTOR_SEARCH_TOP_K: Number of documents to retrieve.
        history_len: Number of previous turns passed to the model.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling value.
        use_web: The string ``'True'`` enables the web search step.
        history: Previous ``(question, answer)`` turns, or ``None`` on the
            first turn.

    Returns:
        ``('', history, history)`` — an empty string followed by the updated
        history twice.
    """
    if history is None:
        history = []
    print(file_obj.name)
    # The vector store is rebuilt from the uploaded file on every turn.
    vector_store = init_knowledge_vector_store(embedding_model, file_obj.name)
    web_content = search_web(query=input) if use_web == 'True' else ''
    resp = get_knowledge_based_answer(
        query=input,
        large_language_model=large_language_model,
        vector_store=vector_store,
        VECTOR_SEARCH_TOP_K=VECTOR_SEARCH_TOP_K,
        web_content=web_content,
        chat_history=history,
        history_len=history_len,
        temperature=temperature,
        top_p=top_p,
    )
    print(resp)
    history.append((input, resp['result']))
    return '', history, history
172
+
173
+
174
if __name__ == "__main__":
    # NOTE(review): SOURCE lost its indentation, so the nesting of the layout
    # contexts below is reconstructed — confirm against the upstream app.py.
    block = gr.Blocks()
    with block as demo:
        # Page header and project description.
        gr.Markdown("""<h1><center>LangChain-ChatLLM-Webui</center></h1>
<center><font size=3>
本项目基于LangChain和大型语言模型系列模型, 提供基于本地知识的自动问答应用. <br>
目前项目提供基于<a href='https://github.com/THUDM/ChatGLM-6B' target="_blank">ChatGLM-6B </a>系列、Minimax的LLM和包括text2vec-base-chinese、ernie-3.0-zh系列以及由<a href='https://cloud.jina.ai/user/inference' target="_blank">Jina</a>提供的ViT-B-32::laion2b-s34b-b79k等多个Embedding模型, 支持上传 txt、docx、md等文本格式文件. <br>
后续将提供更加多样化的LLM、Embedding和参数选项供用户尝试, 欢迎关注<a href='https://github.com/thomas-yanxin/LangChain-ChatGLM-Webui' target="_blank">Github地址</a>. <br>
本项目已内置开发者自己的key,用户无需输入自己的相关key. <br>
当然,更推荐您点击右上角的<strong>Duplicate this Space</strong>,将项目Fork到自己的Space中,保护个人隐私,且避免排队!
</center></font>
""")
        with gr.Row():
            # Left column: model selection, file upload and sampling options.
            with gr.Column(scale=1):
                with gr.Accordion("模型选择"):
                    large_language_model = gr.Dropdown(
                        choices=list(llm_model_dict.keys()),
                        label="large language model",
                        value="ChatGLM-6B-int4")

                    embedding_model = gr.Dropdown(
                        choices=list(embedding_model_dict.keys()),
                        label="Embedding model",
                        value="text2vec-base")

                file = gr.File(label='请上传知识库文件, 目前支持txt、docx、md格式',
                               file_types=['.txt', '.md', '.docx'])

                use_web = gr.Radio(choices=["True", "False"],
                                   label="Web Search",
                                   value="False")

                with gr.Accordion("模型参数配置"):
                    VECTOR_SEARCH_TOP_K = gr.Slider(minimum=1,
                                                    maximum=10,
                                                    value=6,
                                                    step=1,
                                                    label="vector search top k",
                                                    interactive=True)

                    HISTORY_LEN = gr.Slider(minimum=0,
                                            maximum=3,
                                            value=0,
                                            step=1,
                                            label="history len",
                                            interactive=True)

                    temperature = gr.Slider(minimum=0,
                                            maximum=1,
                                            value=0.01,
                                            step=0.01,
                                            label="temperature",
                                            interactive=True)
                    top_p = gr.Slider(minimum=0,
                                      maximum=1,
                                      value=0.9,
                                      step=0.1,
                                      label="top_p",
                                      interactive=True)

            # Right column: conversation view, question box and buttons.
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(label='ChatLLM').style(height=600)
                message = gr.Textbox(label='请输入问题')
                state = gr.State()

                with gr.Row():
                    clear_history = gr.Button("🧹 清除历史对话")
                    send = gr.Button("🚀 发送")

        # The send button and pressing Enter in the textbox run the same
        # handler with the same wiring.
        predict_inputs = [
            message, large_language_model, embedding_model, file,
            VECTOR_SEARCH_TOP_K, HISTORY_LEN, temperature, top_p, use_web,
            state
        ]
        predict_outputs = [message, chatbot, state]

        send.click(predict, inputs=predict_inputs, outputs=predict_outputs)
        clear_history.click(fn=clear_session,
                            inputs=[],
                            outputs=[chatbot, state],
                            queue=False)
        message.submit(predict, inputs=predict_inputs, outputs=predict_outputs)

        gr.Markdown("""提醒:<br>
1. 使用时请先上传自己的知识文件,并且文件中不含某些特殊字符,否则将返回error. <br>
2. 有任何使用问题,请通过[问题交流区](https://huggingface.co/spaces/thomas-yanxin/LangChain-ChatLLM/discussions)或[Github Issue区](https://github.com/thomas-yanxin/LangChain-ChatGLM-Webui/issues)进行反馈. <br>
""")
    demo.queue().launch(server_name='0.0.0.0', share=False)
chatllm.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ from typing import Dict, List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ from langchain.llms.base import LLM
7
+ from langchain.llms.utils import enforce_stop_tokens
8
+ from transformers import AutoModel, AutoTokenizer
9
+
10
# Set at import time, before any tokenizer is created.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEVICE = "cuda"
DEVICE_ID = "0"
# "<device>:<id>" when an id is configured, otherwise the bare device name.
CUDA_DEVICE = DEVICE if not DEVICE_ID else ":".join((DEVICE, DEVICE_ID))
15
+
16
+
17
def torch_gc():
    """Empty the CUDA cache and collect IPC memory on ``CUDA_DEVICE``.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
22
+
23
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    """Build a module-name -> GPU-index map spreading the model over GPUs.

    The model is treated as 30 placement slots shared evenly between the
    GPUs: ``transformer.word_embeddings`` counts as one slot,
    ``transformer.final_layernorm`` plus ``lm_head`` as one slot, and the 28
    entries of ``transformer.layers`` as one slot each.

    Args:
        num_gpus: Number of GPUs to distribute over; must be at least 1.

    Returns:
        A dict mapping each module name to the index of the GPU it goes on.

    Raises:
        ValueError: If ``num_gpus`` is smaller than 1.
    """
    # Without this check 0 raised ZeroDivisionError and negative values hit
    # the bare assert below.
    if num_gpus < 1:
        raise ValueError(f"num_gpus must be at least 1, got {num_gpus}")

    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # Bugfix: on Linux the weight and the input passed to torch.embedding can
    # end up on different devices, which raises RuntimeError.
    # On Windows model.device is set to transformer.word_embeddings.device,
    # on Linux it is set to lm_head.device.
    # chat / stream_chat move input_ids to model.device, so if
    # transformer.word_embeddings.device differs from model.device the call
    # fails. Therefore word_embeddings, final_layernorm and lm_head are all
    # pinned to the first GPU.
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    # Two slots on GPU 0 are already taken by the pinned modules above.
    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        # Internal invariant, not input validation.
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
51
+
52
+
53
+
54
class ChatLLM(LLM):
    """LangChain LLM wrapper around a local chat model or the Minimax API.

    When ``model`` is the string ``'Minimax'`` prompts are sent to the Minimax
    chat-completion HTTP endpoint; otherwise ``model`` is expected to be the
    object loaded by ``load_model`` and is queried through its ``chat`` method.
    """

    # Passed to the local model's chat call as max_length.
    max_token: int = 10000
    # Sampling parameters forwarded to the local model's chat call.
    temperature: float = 0.1
    top_p = 0.9
    # Conversation turns sent along with every prompt.
    history = []
    tokenizer: object = None
    # Either the loaded model object or the literal string 'Minimax'.
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "ChatLLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        """Generate a reply to ``prompt`` and record it in ``history``.

        Args:
            prompt: The fully rendered prompt text.
            stop: Optional stop sequences; applied to local-model replies only.

        Returns:
            The reply text.
        """
        if self.model == 'Minimax':
            import requests

            # Credentials come from the environment.
            group_id = os.getenv('group_id')
            api_key = os.getenv('api_key')

            url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            request_body = {
                "model": "abab5-chat",
                "tokens_to_generate": 512,
                'messages': []
            }

            # Replay earlier turns as alternating USER / BOT messages.
            for turn in self.history:
                request_body['messages'].append({
                    "sender_type": "USER",
                    "text": turn[0]
                })
                request_body['messages'].append({
                    "sender_type": "BOT",
                    "text": turn[1]
                })

            request_body['messages'].append({"sender_type": "USER", "text": prompt})
            resp = requests.post(url, headers=headers, json=request_body)
            response = resp.json()['reply']
            self.history.append((prompt, response))

        else:
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
                # Previously omitted, so the configured top_p had no effect.
                top_p=self.top_p,
            )
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        """Load the tokenizer and model and put the model in eval mode.

        Args:
            model_name_or_path: Checkpoint name or local path.
            llm_device: Requested device string. A ``cuda`` request falls back
                to CPU when CUDA is not available.
            device_map: Optional module -> GPU index map for multi-GPU
                placement; generated automatically when omitted.
            **kwargs: Extra arguments forwarded to ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Decide between single- and multi-GPU deployment from the number
            # of GPUs on this machine.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs).half()
                # A custom device_map may be passed in to control what is
                # placed on each GPU.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)

                self.model = dispatch_model(model, device_map=device_map)
        else:
            # Reaching this branch with a "cuda" request means CUDA is not
            # available; moving the model to "cuda" would fail, so use the CPU.
            target_device = llm_device
            if llm_device.lower().startswith("cuda"):
                target_device = "cpu"
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs)
                .float()
                .to(target_device)
            )
        self.model = self.model.eval()
chinese_text_splitter.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+
6
+
7
class ChineseTextSplitter(CharacterTextSplitter):
    """Sentence-level splitter for (mostly Chinese) text.

    Args:
        pdf: When true, whitespace is normalised before splitting.
        **kwargs: Forwarded to ``CharacterTextSplitter``.
    """

    # A sentence terminator followed by up to two closing quotes, or a
    # zero-width position just before one or two opening quotes / end of text.
    _SENT_SEP_PATTERN = re.compile(
        '([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;

    def __init__(self, pdf: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.pdf = pdf

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into sentences, keeping terminators attached.

        Returns:
            The list of non-empty sentence strings in document order.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            # Raw string: '\s' in a normal literal is an invalid escape sequence.
            text = re.sub(r'\s', ' ', text)
            # Kept from the original; a no-op here because every newline was
            # already turned into a space by the previous substitution.
            text = text.replace("\n\n", "")

        pattern = self._SENT_SEP_PATTERN
        sent_list = []
        for ele in pattern.split(text):
            # Pieces that match the separator pattern at their start are glued
            # onto the previous sentence; other non-empty pieces start a new one.
            if pattern.match(ele) and sent_list:
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
3
+ size 6138625
nltk_data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB). View file
 
nltk_data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
2
+
3
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
4
+ been contributed by various people using NLTK for sentence boundary detection.
5
+
6
+ For information about how to use these models, please refer to the tokenization HOWTO:
7
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
8
+ and chapter 3.8 of the NLTK book:
9
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
10
+
11
+ There are pretrained tokenizers for the following languages:
12
+
13
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
14
+ =======================================================================================================================================================================
15
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
16
+ Literarni Noviny
17
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
18
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
19
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
20
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
21
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
22
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
23
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
24
+ (American)
25
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
26
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
27
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
28
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
29
+ Text Bank (Suomen Kielen newspapers
30
+ Tekstipankki)
31
+ Finnish Center for IT Science
32
+ (CSC)
33
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
34
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
35
+ (European)
36
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
37
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
38
+ (Switzerland) CD-ROM
39
+ (Uses "ss"
40
+ instead of "ß")
41
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
42
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
43
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
44
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
45
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
46
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
47
+ (Bokmål and Information Technologies,
48
+ Nynorsk) Bergen
49
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
50
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
51
+ (http://www.nkjp.pl/)
52
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
53
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
54
+ (Brazilian) (Linguateca)
55
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
56
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
57
+ Slovene Academy for Arts
58
+ and Sciences
59
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
60
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
61
+ (European)
62
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
63
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
64
+ (and some other texts)
65
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
66
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
67
+ (Türkçe Derlem Projesi)
68
+ University of Ankara
69
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
70
+
71
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
72
+ Unicode using the codecs module.
73
+
74
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
75
+ Computational Linguistics 32: 485-525.
76
+
77
+ ---- Training Code ----
78
+
79
+ # import punkt
80
+ import nltk.tokenize.punkt
81
+
82
+ # Make a new Tokenizer
83
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
84
+
85
+ # Read in training corpus (one example: Slovene)
86
+ import codecs
87
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
88
+
89
+ # Train tokenizer
90
+ tokenizer.train(text)
91
+
92
+ # Dump pickled tokenizer
93
+ import pickle
94
+ out = open("slovene.pickle","wb")
95
+ pickle.dump(tokenizer, out)
96
+ out.close()
97
+
98
+ ---------
nltk_data/tokenizers/punkt/PY3/czech.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
3
+ size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
3
+ size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
3
+ size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
3
+ size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
3
+ size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
3
+ size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
3
+ size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
3
+ size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
3
+ size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
3
+ size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
3
+ size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
3
+ size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
3
+ size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
3
+ size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
3
+ size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
3
+ size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
3
+ size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
3
+ size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
3
+ size 1017038
nltk_data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
2
+
3
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
4
+ been contributed by various people using NLTK for sentence boundary detection.
5
+
6
+ For information about how to use these models, please consult the tokenization HOWTO:
7
+ https://www.nltk.org/howto/tokenize.html
8
+ and chapter 3.8 of the NLTK book:
9
+ https://www.nltk.org/book/ch03.html#sec-segmentation
10
+
11
+ There are pretrained tokenizers for the following languages:
12
+
13
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
14
+ =======================================================================================================================================================================
15
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
16
+ Literarni Noviny
17
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
18
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
19
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
20
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
21
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
22
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
23
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
24
+ (American)
25
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
26
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
27
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
28
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
29
+ Text Bank (Suomen Kielen newspapers
30
+ Tekstipankki)
31
+ Finnish Center for IT Science
32
+ (CSC)
33
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
34
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
35
+ (European)
36
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
37
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
38
+ (Switzerland) CD-ROM
39
+ (Uses "ss"
40
+ instead of "ß")
41
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
42
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
43
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
44
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
45
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
46
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
47
+ (Bokmål and Information Technologies,
48
+ Nynorsk) Bergen
49
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
50
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
51
+ (http://www.nkjp.pl/)
52
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
53
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
54
+ (Brazilian) (Linguateca)
55
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
56
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
57
+ Slovene Academy for Arts
58
+ and Sciences
59
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
60
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
61
+ (European)
62
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
63
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
64
+ (and some other texts)
65
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
66
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
67
+ (Türkçe Derlem Projesi)
68
+ University of Ankara
69
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
70
+
71
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
72
+ Unicode using the codecs module.
73
+
74
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
75
+ Computational Linguistics 32: 485-525.
76
+
77
+ ---- Training Code ----
78
+
79
+ # import punkt
80
+ import nltk.tokenize.punkt
81
+
82
+ # Make a new Tokenizer
83
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
84
+
85
+ # Read in training corpus (one example: Slovene)
86
+ import codecs
87
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
88
+
89
+ # Train tokenizer
90
+ tokenizer.train(text)
91
+
92
+ # Dump pickled tokenizer
93
+ import pickle
94
+ out = open("slovene.pickle","wb")
95
+ pickle.dump(tokenizer, out)
96
+ out.close()
97
+
98
+ ---------
nltk_data/tokenizers/punkt/czech.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c085f6283bed0f1390d36a55d126ccc29c9b4dfcd2705e862b1711b7c6bb5ab
3
+ size 1424691
nltk_data/tokenizers/punkt/danish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df8366ad67db22b1f838cd63fcc589a6006faf66d7a46be5312d9c487ce2c811
3
+ size 1427491
nltk_data/tokenizers/punkt/dutch.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f46024d3c840529b56ac2a3118b80b8dc77705734bcdd71ff7c46f5808395e
3
+ size 839761
nltk_data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e2d25d5adc3ee51ac192ce611bdc5378acae7136af5d3c52c2903c669f9aff0
3
+ size 495006
nltk_data/tokenizers/punkt/estonian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9083ef6ef3d5b9992a8a4ea09e889a87be75e2122ad25648307178960634cd8d
3
+ size 1803082
nltk_data/tokenizers/punkt/finnish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce1b4dbe72e400e902220061457f9bd5f491ec37f7af468bc4694980c9623817
3
+ size 2192034
nltk_data/tokenizers/punkt/french.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e0be48e38a54232ea88c817cf34c1f1f8f44954e21f118c65af9f2d6a43cdbd
3
+ size 664010
nltk_data/tokenizers/punkt/german.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:251c2f4bde61ab3fc1cabc2158c62e6ab285fddd16267d2d3885f71e3ed61c7f
3
+ size 1708012
nltk_data/tokenizers/punkt/greek.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b3a6da093ed2df084ded6dc49c88f101d47a0c69398f19ae50af6785d93b1c5
3
+ size 2042362
nltk_data/tokenizers/punkt/italian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41e6aaf554e696703b3d41890973368b9b2f17c342745c07369742928d363731
3
+ size 748532
nltk_data/tokenizers/punkt/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
3
+ size 221207
nltk_data/tokenizers/punkt/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45828b0d57da9a66f107ea277752f6c1cbde51b9f9feba173b2c6e2edb28af21
3
+ size 1422756
nltk_data/tokenizers/punkt/polish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d09a9406f90dbf20f8cbb0a04a7aa0bdb4b71604eda31e97c3df2de5cd2837
3
+ size 2287622
nltk_data/tokenizers/punkt/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c09561e770b6f17e3d85112f83007ff1397dec66c23acb15b9fe046eaefd2e86
3
+ size 739845
nltk_data/tokenizers/punkt/russian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
3
+ size 33027
nltk_data/tokenizers/punkt/slovene.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dc83b900e347c16ed0123868369107cd19d1a6125d099e26889580c4dbba277
3
+ size 939791
nltk_data/tokenizers/punkt/spanish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61afae663cb2968148e0e27d5a3fcd4a5f19648688800caf8e7f998eaa75f4a7
3
+ size 680466
nltk_data/tokenizers/punkt/swedish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5794208b223b2a54bd4ed565045172f9c6ef80b5bead94f71a5499455cda955
3
+ size 1168214
nltk_data/tokenizers/punkt/turkish.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2abb5d7ec4e80aeeb994407254a2e1a0928520727cc25f7bd3fc9ce0b5a78c1
3
+ size 1363199
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ langchain
3
+ accelerate
4
+ duckduckgo_search
5
+ transformers==4.27.1
6
+ unstructured[local-inference]
7
+ layoutparser[layoutmodels,tesseract]
8
+ nltk
9
+ sentence-transformers
10
+ beautifulsoup4
11
+ icetk
12
+ cpm_kernels
13
+ faiss-cpu
14
+ gradio
15
+ nltk
16
+ torch
17
+ torchvision
18
+ protobuf==3.19
19
+ jina