Zulelee committed
Commit
5e9cd1d
1 Parent(s): 8d50bff

Upload 254 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. chains/llmchain_with_history.py +22 -0
  3. common/__init__.py +0 -0
  4. configs/__init__.py +8 -0
  5. configs/basic_config.py.example +32 -0
  6. configs/kb_config.py.example +145 -0
  7. configs/model_config.py.example +302 -0
  8. configs/prompt_config.py.example +127 -0
  9. configs/server_config.py.example +137 -0
  10. docs/ES部署指南.md +29 -0
  11. document_loaders/FilteredCSVloader.py +81 -0
  12. document_loaders/__init__.py +4 -0
  13. document_loaders/mydocloader.py +71 -0
  14. document_loaders/myimgloader.py +25 -0
  15. document_loaders/mypdfloader.py +51 -0
  16. document_loaders/mypptloader.py +59 -0
  17. document_loaders/ocr.py +18 -0
  18. embeddings/__init__.py +0 -0
  19. embeddings/add_embedding_keywords.py +79 -0
  20. embeddings/embedding_keywords.txt +3 -0
  21. img/LLM_success.png +0 -0
  22. img/agent_continue.png +0 -0
  23. img/agent_success.png +0 -0
  24. img/chatchat-qrcode.jpg +0 -0
  25. img/chatchat_icon_blue_square_v2.png +0 -0
  26. img/docker_logs.png +0 -0
  27. img/fastapi_docs_026.png +0 -0
  28. img/init_knowledge_base.jpg +0 -0
  29. img/knowledge_base_success.jpg +0 -0
  30. img/langchain+chatglm.png +3 -0
  31. img/langchain+chatglm2.png +0 -0
  32. img/logo-long-chatchat-trans-v2.png +0 -0
  33. img/official_account_qr.png +0 -0
  34. img/official_wechat_mp_account.png +3 -0
  35. img/partners/autodl.svg +0 -0
  36. img/partners/aws.svg +9 -0
  37. img/partners/chatglm.svg +55 -0
  38. img/partners/zhenfund.svg +9 -0
  39. img/qr_code_86.jpg +0 -0
  40. img/qr_code_87.jpg +0 -0
  41. img/qr_code_88.jpg +0 -0
  42. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-124076-270516.jpg +0 -0
  43. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-20096-279847.jpg +0 -0
  44. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-220157-552735.jpg +0 -0
  45. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-36114-765327.jpg +0 -0
  46. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-392521-261326.jpg +3 -0
  47. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-42284-124759.jpg +0 -0
  48. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-57107-679259.jpg +0 -0
  49. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-618350-869132.jpg +0 -0
  50. knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-838373-426344.jpg +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ img/langchain+chatglm.png filter=lfs diff=lfs merge=lfs -text
+ img/official_wechat_mp_account.png filter=lfs diff=lfs merge=lfs -text
+ knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-392521-261326.jpg filter=lfs diff=lfs merge=lfs -text
chains/llmchain_with_history.py ADDED
@@ -0,0 +1,22 @@
+ from server.utils import get_ChatOpenAI
+ from configs.model_config import LLM_MODELS, TEMPERATURE
+ from langchain.chains import LLMChain
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+
+ model = get_ChatOpenAI(model_name=LLM_MODELS[0], temperature=TEMPERATURE)
+
+
+ human_prompt = "{input}"
+ human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)
+
+ chat_prompt = ChatPromptTemplate.from_messages(
+     [("human", "我们来玩成语接龙,我先来,生龙活虎"),
+      ("ai", "虎头虎脑"),
+      ("human", "{input}")])
+
+
+ chain = LLMChain(prompt=chat_prompt, llm=model, verbose=True)
+ print(chain({"input": "恼羞成怒"}))
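Note that the "history" in this chain is a hardcoded two-turn few-shot exchange. A minimal sketch of how a real rolling history could be passed instead, reusing the `model` and `LLMChain` from above (illustrative only, not part of this commit):

```python
# Sketch: replace the hardcoded turns with a MessagesPlaceholder filled at call time.
from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import AIMessage, HumanMessage

chat_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}"),
])
chain = LLMChain(prompt=chat_prompt, llm=model, verbose=True)

history = [HumanMessage(content="我们来玩成语接龙,我先来,生龙活虎"),
           AIMessage(content="虎头虎脑")]
print(chain({"input": "恼羞成怒", "history": history}))
```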
common/__init__.py ADDED
File without changes
configs/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .basic_config import *
+ from .model_config import *
+ from .kb_config import *
+ from .server_config import *
+ from .prompt_config import *
+
+
+ VERSION = "v0.2.10"
configs/basic_config.py.example ADDED
@@ -0,0 +1,32 @@
+ import logging
+ import os
+ import langchain
+ import tempfile
+ import shutil
+
+
+ # Whether to show verbose logs
+ log_verbose = False
+ langchain.verbose = False
+
+ # The settings below normally do not need to be changed
+
+ # Log format
+ LOG_FORMAT = "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ logging.basicConfig(format=LOG_FORMAT)
+
+
+ # Log storage path
+ LOG_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
+ if not os.path.exists(LOG_PATH):
+     os.mkdir(LOG_PATH)
+
+ # Temporary file directory, mainly used for file-based chat
+ BASE_TEMP_DIR = os.path.join(tempfile.gettempdir(), "chatchat")
+ try:
+     shutil.rmtree(BASE_TEMP_DIR)
+ except Exception:
+     pass
+ os.makedirs(BASE_TEMP_DIR, exist_ok=True)
configs/kb_config.py.example ADDED
@@ -0,0 +1,145 @@
+ import os
+
+ # Default knowledge base
+ DEFAULT_KNOWLEDGE_BASE = "samples"
+
+ # Default vector store / full-text search engine type. Options: faiss, milvus (offline) & zilliz (online), pgvector, and the full-text search engine es
+ DEFAULT_VS_TYPE = "faiss"
+
+ # Number of cached vector stores (FAISS only)
+ CACHED_VS_NUM = 1
+
+ # Number of cached temporary vector stores (FAISS only), used for file-based chat
+ CACHED_MEMO_VS_NUM = 10
+
+ # Length of a single text chunk in the knowledge base (does not apply to MarkdownHeaderTextSplitter)
+ CHUNK_SIZE = 250
+
+ # Overlap length between adjacent chunks in the knowledge base (does not apply to MarkdownHeaderTextSplitter)
+ OVERLAP_SIZE = 50
+
+ # Number of matching vectors returned per knowledge base query
+ VECTOR_SEARCH_TOP_K = 3
+
+ # Distance threshold for knowledge base matching, normally in the range 0-1; the smaller the score, the smaller the distance and the higher the relevance.
+ # Some users have reported match scores above 1, so the default is 1 for compatibility; the WebUI allows adjusting it in the range 0-2.
+ SCORE_THRESHOLD = 1.0
+
+ # Default search engine. Options: bing, duckduckgo, metaphor
+ DEFAULT_SEARCH_ENGINE = "duckduckgo"
+
+ # Number of search engine results to return
+ SEARCH_ENGINE_TOP_K = 3
+
+
+ # Required variables for Bing search
+ # Bing search requires a Bing Subscription Key; apply for a Bing Search trial in the Azure portal.
+ # For how to apply, see
+ # https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource
+ # For creating a Bing search instance with Python, see:
+ # https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/quickstarts/rest/python
+ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
+ # Note: this is NOT the Bing Webmaster Tools API key.
+
+ # Also, if you see "Failed to establish a new connection: [Errno 110] Connection timed out" on a server,
+ # the server is behind a firewall; ask the administrator to whitelist the endpoint (on a corporate server this is usually a lost cause).
+ BING_SUBSCRIPTION_KEY = ""
+
+ # Metaphor search requires an API key
+ METAPHOR_API_KEY = ""
+
+ # Seniverse weather API key, used by the weather Agent. Apply at: https://www.seniverse.com/
+ SENIVERSE_API_KEY = ""
+
+ # Whether to enable Chinese title enhancement, plus its related settings.
+ # Adds a title-detection step that marks which text segments are titles in the metadata,
+ # then joins each segment with the title one level above it to enrich the text.
+ ZH_TITLE_ENHANCE = False
+
+ # PDF OCR control: only run OCR on images whose size exceeds the given page ratios (image width / page width, image height / page height).
+ # This avoids noise from small images in PDFs and speeds up processing of non-scanned PDFs.
+ PDF_OCR_THRESHOLD = (0.6, 0.6)
+
+ # Introductory description of each knowledge base, shown at initialization and used for Agent calls. If omitted, there is no description and the KB will not be called by the Agent.
+ KB_INFO = {
+     "知识库名称": "知识库介绍",
+     "samples": "关于本项目issue的解答",
+ }
+
+
+ # The settings below normally do not need to be changed
+
+ # Default knowledge base storage path
+ KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
+ if not os.path.exists(KB_ROOT_PATH):
+     os.mkdir(KB_ROOT_PATH)
+ # Default database storage path.
+ # If using sqlite, you can modify DB_ROOT_PATH directly; for any other database, modify SQLALCHEMY_DATABASE_URI instead.
+ DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
+ SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}"
+
+ # Available vector store types and their configuration
+ kbs_config = {
+     "faiss": {
+     },
+     "milvus": {
+         "host": "127.0.0.1",
+         "port": "19530",
+         "user": "",
+         "password": "",
+         "secure": False,
+     },
+     "zilliz": {
+         "host": "in01-a7ce524e41e3935.ali-cn-hangzhou.vectordb.zilliz.com.cn",
+         "port": "19530",
+         "user": "",
+         "password": "",
+         "secure": True,
+     },
+     "pg": {
+         "connection_uri": "postgresql://postgres:[email protected]:5432/langchain_chatchat",
+     },
+
+     "es": {
+         "host": "127.0.0.1",
+         "port": "9200",
+         "index_name": "test_index",
+         "user": "",
+         "password": ""
+     },
+     "milvus_kwargs": {
+         "search_params": {"metric_type": "L2"},  # add search_params here
+         "index_params": {"metric_type": "L2", "index_type": "HNSW"}  # add index_params here
+     }
+ }
+
+ # TextSplitter settings; do not modify them if you do not understand what they mean.
+ text_splitter_dict = {
+     "ChineseRecursiveTextSplitter": {
+         "source": "huggingface",  # choosing tiktoken uses the openai method instead
+         "tokenizer_name_or_path": "",
+     },
+     "SpacyTextSplitter": {
+         "source": "huggingface",
+         "tokenizer_name_or_path": "gpt2",
+     },
+     "RecursiveCharacterTextSplitter": {
+         "source": "tiktoken",
+         "tokenizer_name_or_path": "cl100k_base",
+     },
+     "MarkdownHeaderTextSplitter": {
+         "headers_to_split_on":
+             [
+                 ("#", "head1"),
+                 ("##", "head2"),
+                 ("###", "head3"),
+                 ("####", "head4"),
+             ]
+     },
+ }
+
+ # TEXT_SPLITTER name
+ TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter"
+
+ # Vocabulary file of custom keywords for the Embedding model
+ EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt"
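For reference, the tiktoken-based entry in `text_splitter_dict` above corresponds to a splitter built roughly like this with the stock LangChain API (a sketch, not code from this commit; the sample text is made up):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Mirrors the "RecursiveCharacterTextSplitter" entry above plus CHUNK_SIZE / OVERLAP_SIZE.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",  # tokenizer_name_or_path
    chunk_size=250,               # CHUNK_SIZE
    chunk_overlap=50,             # OVERLAP_SIZE
)
chunks = splitter.split_text("知识库中的一段长文本……" * 100)
print(len(chunks), chunks[0][:50])
```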
configs/model_config.py.example ADDED
@@ -0,0 +1,302 @@
+ import os
+
+ # You can specify an absolute path here to store all Embedding and LLM models in one place.
+ # Each model can be its own directory, or a second-level subdirectory under some directory.
+ # If a model directory name matches a key or value in MODEL_PATH, the program detects and loads it automatically; the paths in MODEL_PATH need no modification.
+ MODEL_ROOT_PATH = ""
+
+ # Name of the selected Embedding model
+ EMBEDDING_MODEL = "bge-large-zh-v1.5"
+
+ # Device to run the Embedding model on. "auto" autodetects it (with a warning); it can also be set manually to one of "cuda", "mps", "cpu", "xpu".
+ EMBEDDING_DEVICE = "auto"
+
+ # Selected reranker model
+ RERANKER_MODEL = "bge-reranker-large"
+ # Whether to enable the reranker model
+ USE_RERANKER = False
+ RERANKER_MAX_LENGTH = 1024
+
+ # Configure these if you need to add custom keywords to EMBEDDING_MODEL
+ EMBEDDING_KEYWORD_FILE = "keywords.txt"
+ EMBEDDING_MODEL_OUTPUT_PATH = "output"
+
+ # Names of the LLMs to run; may include both local and online models. All local models in the list are loaded at project startup.
+ # The first model in the list serves as the default for the API and WebUI.
+ # Here we use two currently mainstream offline models, with chatglm3-6b loaded by default.
+ # If you are short on VRAM, use Qwen-1_8B-Chat, which needs only 3.8 GB of VRAM at FP16.
+
+ LLM_MODELS = ["chatglm3-6b", "zhipu-api", "openai-api"]
+ Agent_MODEL = None
+
+ # Device to run the LLM on. "auto" autodetects it (with a warning); it can also be set manually to one of "cuda", "mps", "cpu", "xpu".
+ LLM_DEVICE = "auto"
+
+ HISTORY_LEN = 3
+
+ MAX_TOKENS = 2048
+
+ TEMPERATURE = 0.7
+
+ ONLINE_LLM_MODEL = {
+     "openai-api": {
+         "model_name": "gpt-4",
+         "api_base_url": "https://api.openai.com/v1",
+         "api_key": "",
+         "openai_proxy": "",
+     },
+
+     # Zhipu AI API; register and obtain an api key at http://open.bigmodel.cn
+     "zhipu-api": {
+         "api_key": "",
+         "version": "glm-4",
+         "provider": "ChatGLMWorker",
+     },
+
+     # Register and obtain an api key at https://api.minimax.chat/
+     "minimax-api": {
+         "group_id": "",
+         "api_key": "",
+         "is_pro": False,
+         "provider": "MiniMaxWorker",
+     },
+
+     # Register and obtain an api key at https://xinghuo.xfyun.cn/
+     "xinghuo-api": {
+         "APPID": "",
+         "APISecret": "",
+         "api_key": "",
+         "version": "v3.0",  # version of the iFLYTEK Spark model you use; options include "v3.0", "v2.0", "v1.5"
+         "provider": "XingHuoWorker",
+     },
+
+     # Baidu Qianfan API; see https://cloud.baidu.com/doc/WENXINWORKSHOP/s/4lilb2lpf for how to apply
+     "qianfan-api": {
+         "version": "ERNIE-Bot",  # Case-sensitive. Currently supports "ERNIE-Bot" or "ERNIE-Bot-turbo"; see the official docs for more.
+         "version_url": "",  # Alternatively, leave version empty and fill in the API URL of the model deployment you created on Qianfan
+         "api_key": "",
+         "secret_key": "",
+         "provider": "QianFanWorker",
+     },
+
+     # Volcengine Ark API; docs at https://www.volcengine.com/docs/82379
+     "fangzhou-api": {
+         "version": "chatglm-6b-model",
+         "version_url": "",
+         "api_key": "",
+         "secret_key": "",
+         "provider": "FangZhouWorker",
+     },
+
+     # Alibaba Cloud Tongyi Qianwen API; docs at https://help.aliyun.com/zh/dashscope/developer-reference/api-details
+     "qwen-api": {
+         "version": "qwen-max",
+         "api_key": "",
+         "provider": "QwenWorker",
+         "embed_model": "text-embedding-v1"  # embedding model name
+     },
+
+     # Baichuan API; see https://www.baichuan-ai.com/home#api-enter for how to apply
+     "baichuan-api": {
+         "version": "Baichuan2-53B",
+         "api_key": "",
+         "secret_key": "",
+         "provider": "BaiChuanWorker",
+     },
+
+     # Azure API
+     "azure-api": {
+         "deployment_name": "",  # name of the deployment
+         "resource_name": "",  # only the resource_name part of https://{resource_name}.openai.azure.com/openai/; leave out the rest
+         "api_version": "",  # the API version, not the model version
+         "api_key": "",
+         "provider": "AzureWorker",
+     },
+
+     # Kunlun Tiangong API https://model-platform.tiangong.cn/
+     "tiangong-api": {
+         "version": "SkyChat-MegaVerse",
+         "api_key": "",
+         "secret_key": "",
+         "provider": "TianGongWorker",
+     },
+     # Gemini API https://makersuite.google.com/app/apikey
+     "gemini-api": {
+         "api_key": "",
+         "provider": "GeminiWorker",
+     }
+
+ }
+
+ # Modify the values in the dict below to specify where local embedding models are stored, in one of the following ways:
+ # 1. Change the value to the model's absolute path.
+ # 2. Leave the value here unchanged (using text2vec as an example):
+ #     2.1 If any of the following subdirectories exists under {MODEL_ROOT_PATH}, it is used:
+ #         - text2vec
+ #         - GanymedeNil/text2vec-large-chinese
+ #         - text2vec-large-chinese
+ #     2.2 If none of those local paths exists, the huggingface model is used.
+
+ MODEL_PATH = {
+     "embed_model": {
+         "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
+         "ernie-base": "nghuyong/ernie-3.0-base-zh",
+         "text2vec-base": "shibing624/text2vec-base-chinese",
+         "text2vec": "GanymedeNil/text2vec-large-chinese",
+         "text2vec-paraphrase": "shibing624/text2vec-base-chinese-paraphrase",
+         "text2vec-sentence": "shibing624/text2vec-base-chinese-sentence",
+         "text2vec-multilingual": "shibing624/text2vec-base-multilingual",
+         "text2vec-bge-large-chinese": "shibing624/text2vec-bge-large-chinese",
+         "m3e-small": "moka-ai/m3e-small",
+         "m3e-base": "moka-ai/m3e-base",
+         "m3e-large": "moka-ai/m3e-large",
+         "bge-small-zh": "BAAI/bge-small-zh",
+         "bge-base-zh": "BAAI/bge-base-zh",
+         "bge-large-zh": "BAAI/bge-large-zh",
+         "bge-large-zh-noinstruct": "BAAI/bge-large-zh-noinstruct",
+         "bge-base-zh-v1.5": "BAAI/bge-base-zh-v1.5",
+         "bge-large-zh-v1.5": "BAAI/bge-large-zh-v1.5",
+         "piccolo-base-zh": "sensenova/piccolo-base-zh",
+         "piccolo-large-zh": "sensenova/piccolo-large-zh",
+         "nlp_gte_sentence-embedding_chinese-large": "damo/nlp_gte_sentence-embedding_chinese-large",
+         "text-embedding-ada-002": "your OPENAI_API_KEY",
+     },
+
+     "llm_model": {
+         "chatglm2-6b": "THUDM/chatglm2-6b",
+         "chatglm2-6b-32k": "THUDM/chatglm2-6b-32k",
+         "chatglm3-6b": "THUDM/chatglm3-6b",
+         "chatglm3-6b-32k": "THUDM/chatglm3-6b-32k",
+
+         "Orion-14B-Chat": "OrionStarAI/Orion-14B-Chat",
+         "Orion-14B-Chat-Plugin": "OrionStarAI/Orion-14B-Chat-Plugin",
+         "Orion-14B-LongChat": "OrionStarAI/Orion-14B-LongChat",
+
+         "Llama-2-7b-chat-hf": "meta-llama/Llama-2-7b-chat-hf",
+         "Llama-2-13b-chat-hf": "meta-llama/Llama-2-13b-chat-hf",
+         "Llama-2-70b-chat-hf": "meta-llama/Llama-2-70b-chat-hf",
+
+         "Qwen-1_8B-Chat": "Qwen/Qwen-1_8B-Chat",
+         "Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
+         "Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
+         "Qwen-72B-Chat": "Qwen/Qwen-72B-Chat",
+
+         "baichuan-7b-chat": "baichuan-inc/Baichuan-7B-Chat",
+         "baichuan-13b-chat": "baichuan-inc/Baichuan-13B-Chat",
+         "baichuan2-7b-chat": "baichuan-inc/Baichuan2-7B-Chat",
+         "baichuan2-13b-chat": "baichuan-inc/Baichuan2-13B-Chat",
+
+         "internlm-7b": "internlm/internlm-7b",
+         "internlm-chat-7b": "internlm/internlm-chat-7b",
+         "internlm2-chat-7b": "internlm/internlm2-chat-7b",
+         "internlm2-chat-20b": "internlm/internlm2-chat-20b",
+
+         "BlueLM-7B-Chat": "vivo-ai/BlueLM-7B-Chat",
+         "BlueLM-7B-Chat-32k": "vivo-ai/BlueLM-7B-Chat-32k",
+
+         "Yi-34B-Chat": "01-ai/Yi-34B-Chat",
+
+         "agentlm-7b": "THUDM/agentlm-7b",
+         "agentlm-13b": "THUDM/agentlm-13b",
+         "agentlm-70b": "THUDM/agentlm-70b",
+
+         "falcon-7b": "tiiuae/falcon-7b",
+         "falcon-40b": "tiiuae/falcon-40b",
+         "falcon-rw-7b": "tiiuae/falcon-rw-7b",
+
+         "aquila-7b": "BAAI/Aquila-7B",
+         "aquilachat-7b": "BAAI/AquilaChat-7B",
+         "open_llama_13b": "openlm-research/open_llama_13b",
+         "vicuna-13b-v1.5": "lmsys/vicuna-13b-v1.5",
+         "koala": "young-geng/koala",
+         "mpt-7b": "mosaicml/mpt-7b",
+         "mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
+         "mpt-30b": "mosaicml/mpt-30b",
+         "opt-66b": "facebook/opt-66b",
+         "opt-iml-max-30b": "facebook/opt-iml-max-30b",
+         "gpt2": "gpt2",
+         "gpt2-xl": "gpt2-xl",
+         "gpt-j-6b": "EleutherAI/gpt-j-6b",
+         "gpt4all-j": "nomic-ai/gpt4all-j",
+         "gpt-neox-20b": "EleutherAI/gpt-neox-20b",
+         "pythia-12b": "EleutherAI/pythia-12b",
+         "oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
+         "dolly-v2-12b": "databricks/dolly-v2-12b",
+         "stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",
+     },
+
+     "reranker": {
+         "bge-reranker-large": "BAAI/bge-reranker-large",
+         "bge-reranker-base": "BAAI/bge-reranker-base",
+     }
+ }
+
+ # The settings below normally do not need to be changed
+
+ # nltk model storage path
+ NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
+
+ # Using VLLM may degrade the model's reasoning ability and make it unable to complete Agent tasks
+ VLLM_MODEL_DICT = {
+     "chatglm2-6b": "THUDM/chatglm2-6b",
+     "chatglm2-6b-32k": "THUDM/chatglm2-6b-32k",
+     "chatglm3-6b": "THUDM/chatglm3-6b",
+     "chatglm3-6b-32k": "THUDM/chatglm3-6b-32k",
+
+     "Llama-2-7b-chat-hf": "meta-llama/Llama-2-7b-chat-hf",
+     "Llama-2-13b-chat-hf": "meta-llama/Llama-2-13b-chat-hf",
+     "Llama-2-70b-chat-hf": "meta-llama/Llama-2-70b-chat-hf",
+
+     "Qwen-1_8B-Chat": "Qwen/Qwen-1_8B-Chat",
+     "Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
+     "Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
+     "Qwen-72B-Chat": "Qwen/Qwen-72B-Chat",
+
+     "baichuan-7b-chat": "baichuan-inc/Baichuan-7B-Chat",
+     "baichuan-13b-chat": "baichuan-inc/Baichuan-13B-Chat",
+     "baichuan2-7b-chat": "baichuan-inc/Baichuan2-7B-Chat",
+     "baichuan2-13b-chat": "baichuan-inc/Baichuan2-13B-Chat",
+
+     "BlueLM-7B-Chat": "vivo-ai/BlueLM-7B-Chat",
+     "BlueLM-7B-Chat-32k": "vivo-ai/BlueLM-7B-Chat-32k",
+
+     "internlm-7b": "internlm/internlm-7b",
+     "internlm-chat-7b": "internlm/internlm-chat-7b",
+     "internlm2-chat-7b": "internlm/internlm2-chat-7b",
+     "internlm2-chat-20b": "internlm/internlm2-chat-20b",
+
+     "aquila-7b": "BAAI/Aquila-7B",
+     "aquilachat-7b": "BAAI/AquilaChat-7B",
+
+     "falcon-7b": "tiiuae/falcon-7b",
+     "falcon-40b": "tiiuae/falcon-40b",
+     "falcon-rw-7b": "tiiuae/falcon-rw-7b",
+     "gpt2": "gpt2",
+     "gpt2-xl": "gpt2-xl",
+     "gpt-j-6b": "EleutherAI/gpt-j-6b",
+     "gpt4all-j": "nomic-ai/gpt4all-j",
+     "gpt-neox-20b": "EleutherAI/gpt-neox-20b",
+     "pythia-12b": "EleutherAI/pythia-12b",
+     "oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
+     "dolly-v2-12b": "databricks/dolly-v2-12b",
+     "stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",
+     "open_llama_13b": "openlm-research/open_llama_13b",
+     "vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
+     "koala": "young-geng/koala",
+     "mpt-7b": "mosaicml/mpt-7b",
+     "mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
+     "mpt-30b": "mosaicml/mpt-30b",
+     "opt-66b": "facebook/opt-66b",
+     "opt-iml-max-30b": "facebook/opt-iml-max-30b",
+
+ }
+
+ SUPPORT_AGENT_MODEL = [
+     "openai-api",  # GPT-4 model
+     "qwen-api",  # Qwen Max model
+     "zhipu-api",  # Zhipu AI GLM-4 model
+     "Qwen",  # all local Qwen-series models
+     "chatglm3-6b",
+     "internlm2-chat-20b",
+     "Orion-14B-Chat-Plugin",
+ ]
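The comment above MODEL_PATH describes a three-step lookup: absolute path, then subdirectories of MODEL_ROOT_PATH, then the huggingface repo id. A rough illustration of that order, assuming the MODEL_PATH and MODEL_ROOT_PATH names defined in this file (the project's actual helper is not part of this 50-file view and may differ):

```python
import os

def resolve_model_path(name: str) -> str:
    # Illustrative sketch only: mirrors the documented lookup order.
    value = MODEL_PATH["embed_model"].get(name) or MODEL_PATH["llm_model"].get(name, name)
    if os.path.isdir(value):
        return value  # 1. the configured value is already a usable local path
    for candidate in (name, value, os.path.basename(value)):
        path = os.path.join(MODEL_ROOT_PATH, candidate)
        if os.path.isdir(path):
            return path  # 2. found as a subdirectory of MODEL_ROOT_PATH
    return value  # 3. fall back to the huggingface repo id
```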
configs/prompt_config.py.example ADDED
@@ -0,0 +1,127 @@
+ # Prompt templates use Jinja2 syntax; in short, double curly braces replace the single braces of f-strings.
+ # This config file supports hot reloading: no service restart is needed after modifying a prompt template.
+
+ # Variables supported by LLM chat:
+ # - input: the user's input
+
+ # Variables supported by knowledge base and search engine chat:
+ # - context: knowledge text concatenated from retrieval results
+ # - question: the question asked by the user
+
+ # Variables supported by Agent chat:
+
+ # - tools: list of available tools
+ # - tool_names: list of available tool names
+ # - history: conversation history between the user and the Agent
+ # - input: the user's input
+ # - agent_scratchpad: the Agent's reasoning log
+
+ PROMPT_TEMPLATES = {
+     "llm_chat": {
+         "default":
+             '{{ input }}',
+
+         "with_history":
+             'The following is a friendly conversation between a human and an AI. '
+             'The AI is talkative and provides lots of specific details from its context. '
+             'If the AI does not know the answer to a question, it truthfully says it does not know.\n\n'
+             'Current conversation:\n'
+             '{history}\n'
+             'Human: {input}\n'
+             'AI:',
+
+         "py":
+             '你是一个聪明的代码助手,请你给我写出简单的py代码。 \n'
+             '{{ input }}',
+     },
+
+
+     "knowledge_base_chat": {
+         "default":
+             '<指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,'
+             '不允许在答案中添加编造成分,答案请使用中文。 </指令>\n'
+             '<已知信息>{{ context }}</已知信息>\n'
+             '<问题>{{ question }}</问题>\n',
+
+         "text":
+             '<指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,答案请使用中文。 </指令>\n'
+             '<已知信息>{{ context }}</已知信息>\n'
+             '<问题>{{ question }}</问题>\n',
+
+         "empty":  # used when nothing is retrieved from the knowledge base
+             '请你回答我的问题:\n'
+             '{{ question }}\n\n',
+     },
+
+
+     "search_engine_chat": {
+         "default":
+             '<指令>这是我搜索到的互联网信息,请你根据这些信息进行提取并有调理,简洁的回答问题。'
+             '如果无法从中得到答案,请说 “无法搜索到能回答问题的内容”。 </指令>\n'
+             '<已知信息>{{ context }}</已知信息>\n'
+             '<问题>{{ question }}</问题>\n',
+
+         "search":
+             '<指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,答案请使用中文。 </指令>\n'
+             '<已知信息>{{ context }}</已知信息>\n'
+             '<问题>{{ question }}</问题>\n',
+     },
+
+
+     "agent_chat": {
+         "default":
+             'Answer the following questions as best you can. If it is in order, you can use some tools appropriately. '
+             'You have access to the following tools:\n\n'
+             '{tools}\n\n'
+             'Use the following format:\n'
+             'Question: the input question you must answer\n'
+             'Thought: you should always think about what to do and what tools to use.\n'
+             'Action: the action to take, should be one of [{tool_names}]\n'
+             'Action Input: the input to the action\n'
+             'Observation: the result of the action\n'
+             '... (this Thought/Action/Action Input/Observation can be repeated zero or more times)\n'
+             'Thought: I now know the final answer\n'
+             'Final Answer: the final answer to the original input question\n'
+             'Begin!\n\n'
+             'history: {history}\n\n'
+             'Question: {input}\n\n'
+             'Thought: {agent_scratchpad}\n',
+
+         "ChatGLM3":
+             'You can answer using the tools, or answer directly using your knowledge without using the tools. '
+             'Respond to the human as helpfully and accurately as possible.\n'
+             'You have access to the following tools:\n'
+             '{tools}\n'
+             'Use a json blob to specify a tool by providing an action key (tool name) '
+             'and an action_input key (tool input).\n'
+             'Valid "action" values: "Final Answer" or [{tool_names}]\n'
+             'Provide only ONE action per $JSON_BLOB, as shown:\n\n'
+             '```\n'
+             '{{{{\n'
+             '  "action": $TOOL_NAME,\n'
+             '  "action_input": $INPUT\n'
+             '}}}}\n'
+             '```\n\n'
+             'Follow this format:\n\n'
+             'Question: input question to answer\n'
+             'Thought: consider previous and subsequent steps\n'
+             'Action:\n'
+             '```\n'
+             '$JSON_BLOB\n'
+             '```\n'
+             'Observation: action result\n'
+             '... (repeat Thought/Action/Observation N times)\n'
+             'Thought: I know what to respond\n'
+             'Action:\n'
+             '```\n'
+             '{{{{\n'
+             '  "action": "Final Answer",\n'
+             '  "action_input": "Final response to human"\n'
+             '}}}}\n'
+             'Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. '
+             'Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:.\n'
+             'history: {history}\n\n'
+             'Question: {input}\n\n'
+             'Thought: {agent_scratchpad}',
+     }
+ }
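For reference, a Jinja2 template from this file can be rendered with LangChain's PromptTemplate by passing template_format="jinja2" (an illustrative sketch, not project code; requires the jinja2 package, and the sample context/question strings are made up):

```python
from langchain.prompts import PromptTemplate

tmpl = PROMPT_TEMPLATES["knowledge_base_chat"]["default"]
prompt = PromptTemplate.from_template(tmpl, template_format="jinja2")
# context and question are the variables documented at the top of this file.
print(prompt.format(context="(retrieved knowledge text)", question="什么是知识库?"))
```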
configs/server_config.py.example ADDED
@@ -0,0 +1,137 @@
+ import sys
+ from configs.model_config import LLM_DEVICE
+
+ # Default timeout for httpx requests (seconds). If loading models or chatting is slow and you hit timeout errors, increase this value.
+ HTTPX_DEFAULT_TIMEOUT = 300.0
+
+ # Whether the API allows cross-origin requests; defaults to False. Set it to True if you need to enable CORS.
+ OPEN_CROSS_DOMAIN = False
+
+ # Default bind host for each server. If you change it to "0.0.0.0", change the host of every XX_SERVER below as well.
+ DEFAULT_BIND_HOST = "0.0.0.0" if sys.platform != "win32" else "127.0.0.1"
+
+ # webui.py server
+ WEBUI_SERVER = {
+     "host": DEFAULT_BIND_HOST,
+     "port": 8501,
+ }
+
+ # api.py server
+ API_SERVER = {
+     "host": DEFAULT_BIND_HOST,
+     "port": 7861,
+ }
+
+ # fastchat openai_api server
+ FSCHAT_OPENAI_API = {
+     "host": DEFAULT_BIND_HOST,
+     "port": 20000,
+ }
+
+ # fastchat model_worker server
+ # These models must be correctly configured in model_config.MODEL_PATH or ONLINE_MODEL.
+ # When launching startup.py you can specify models with `--model-name xxxx yyyy`; if unspecified, LLM_MODELS is used.
+ FSCHAT_MODEL_WORKERS = {
+     # Default settings shared by all models; can be overridden in each model's own section.
+     "default": {
+         "host": DEFAULT_BIND_HOST,
+         "port": 20002,
+         "device": LLM_DEVICE,
+         # Inference acceleration framework to use: False or 'vllm'. If vllm runs into HuggingFace communication problems, see doc/FAQ.
+         # vllm support for some models is still immature, so it is disabled by default for now.
+         "infer_turbo": False,
+
+         # Parameters needed for multi-GPU loading in model_worker
+         # "gpus": None,  # GPUs to use, as a string such as "0,1"; if this has no effect, specify them via CUDA_VISIBLE_DEVICES="0,1" instead
+         # "num_gpus": 1,  # number of GPUs to use
+         # "max_gpu_memory": "20GiB",  # maximum VRAM used per GPU
+
+         # Less common model_worker parameters; configure as needed
+         # "load_8bit": False,  # enable 8-bit quantization
+         # "cpu_offloading": None,
+         # "gptq_ckpt": None,
+         # "gptq_wbits": 16,
+         # "gptq_groupsize": -1,
+         # "gptq_act_order": False,
+         # "awq_ckpt": None,
+         # "awq_wbits": 16,
+         # "awq_groupsize": -1,
+         # "model_names": LLM_MODELS,
+         # "conv_template": None,
+         # "limit_worker_concurrency": 5,
+         # "stream_interval": 2,
+         # "no_register": False,
+         # "embed_in_truncate": False,
+
+         # vllm_worker parameters. Note that vllm requires a GPU and has only been tested on Linux.
+
+         # tokenizer = model_path  # add this here if the tokenizer differs from model_path
+         # 'tokenizer_mode': 'auto',
+         # 'trust_remote_code': True,
+         # 'download_dir': None,
+         # 'load_format': 'auto',
+         # 'dtype': 'auto',
+         # 'seed': 0,
+         # 'worker_use_ray': False,
+         # 'pipeline_parallel_size': 1,
+         # 'tensor_parallel_size': 1,
+         # 'block_size': 16,
+         # 'swap_space': 4,  # GiB
+         # 'gpu_memory_utilization': 0.90,
+         # 'max_num_batched_tokens': 2560,
+         # 'max_num_seqs': 256,
+         # 'disable_log_stats': False,
+         # 'conv_template': None,
+         # 'limit_worker_concurrency': 5,
+         # 'no_register': False,
+         # 'num_gpus': 1,
+         # 'engine_use_ray': False,
+         # 'disable_log_requests': False
+
+     },
+     "Qwen-1_8B-Chat": {
+         "device": "cpu",
+     },
+     "chatglm3-6b": {
+         "device": "cuda",
+     },
+
+     # The entries below need no modification; set which models to launch in model_config.
+     "zhipu-api": {
+         "port": 21001,
+     },
+     "minimax-api": {
+         "port": 21002,
+     },
+     "xinghuo-api": {
+         "port": 21003,
+     },
+     "qianfan-api": {
+         "port": 21004,
+     },
+     "fangzhou-api": {
+         "port": 21005,
+     },
+     "qwen-api": {
+         "port": 21006,
+     },
+     "baichuan-api": {
+         "port": 21007,
+     },
+     "azure-api": {
+         "port": 21008,
+     },
+     "tiangong-api": {
+         "port": 21009,
+     },
+     "gemini-api": {
+         "port": 21010,
+     },
+ }
+
+ FSCHAT_CONTROLLER = {
+     "host": DEFAULT_BIND_HOST,
+     "port": 20001,
+     "dispatch_method": "shortest_queue",
+ }
docs/ES部署指南.md ADDED
@@ -0,0 +1,29 @@
+
+ # ES-based data insertion, retrieval, deletion, and update
+ ```shell
+ author: 唐国梁Tommy
+ e-mail: [email protected]
+
+ If you run into any problems, you can contact me; the service ran without issues after I deployed it.
+ ```
+
+ ## Step 1: Deploy ES with docker
+ ```shell
+ docker network create elastic
+ docker run -id --name elasticsearch --net elastic -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.security.http.ssl.enabled=false" -t docker.elastic.co/elasticsearch/elasticsearch:8.8.2
+ ```
+
+ ## Step 2: Deploy Kibana with docker
+ **Note: the Kibana version must match the ES version**
+ ```shell
+ docker pull docker.elastic.co/kibana/kibana:{version}
+ docker run --name kibana --net elastic -p 5601:5601 docker.elastic.co/kibana/kibana:{version}
+ ```
+
+ ## Step 3: Core code
+ ```shell
+ 1. Core code path:
+ server/knowledge_base/kb_service/es_kb_service.py
+
+ 2. Configure the ES parameters (IP, PORT, etc.) in configs/kb_config.py (the "es" entry of kbs_config);
+ ```
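After Step 1, the ES node should answer on port 9200. A minimal sanity-check sketch using the requests package (host, port, and the expected version follow the docker command above; with xpack security disabled, no credentials are needed):

```python
import requests

# Expect an info document from the single-node cluster started above.
resp = requests.get("http://127.0.0.1:9200")
resp.raise_for_status()
print(resp.json()["version"]["number"])  # expect "8.8.2"
```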
document_loaders/FilteredCSVloader.py ADDED
@@ -0,0 +1,81 @@
+ ## A CSV file loader that reads only the specified columns
+
+ from langchain.document_loaders import CSVLoader
+ import csv
+ from io import TextIOWrapper
+ from typing import Dict, List, Optional
+ from langchain.docstore.document import Document
+ from langchain.document_loaders.helpers import detect_file_encodings
+
+
+ class FilteredCSVLoader(CSVLoader):
+     def __init__(
+             self,
+             file_path: str,
+             columns_to_read: List[str],
+             source_column: Optional[str] = None,
+             metadata_columns: List[str] = [],
+             csv_args: Optional[Dict] = None,
+             encoding: Optional[str] = None,
+             autodetect_encoding: bool = False,
+     ):
+         super().__init__(
+             file_path=file_path,
+             source_column=source_column,
+             metadata_columns=metadata_columns,
+             csv_args=csv_args,
+             encoding=encoding,
+             autodetect_encoding=autodetect_encoding,
+         )
+         self.columns_to_read = columns_to_read
+
+     def load(self) -> List[Document]:
+         """Load data into document objects."""
+
+         docs = []
+         try:
+             with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
+                 docs = self.__read_file(csvfile)
+         except UnicodeDecodeError as e:
+             if self.autodetect_encoding:
+                 detected_encodings = detect_file_encodings(self.file_path)
+                 for encoding in detected_encodings:
+                     try:
+                         with open(
+                                 self.file_path, newline="", encoding=encoding.encoding
+                         ) as csvfile:
+                             docs = self.__read_file(csvfile)
+                             break
+                     except UnicodeDecodeError:
+                         continue
+             else:
+                 raise RuntimeError(f"Error loading {self.file_path}") from e
+         except Exception as e:
+             raise RuntimeError(f"Error loading {self.file_path}") from e
+
+         return docs
+
+     def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
+         docs = []
+         csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
+         for i, row in enumerate(csv_reader):
+             if self.columns_to_read[0] in row:
+                 content = row[self.columns_to_read[0]]
+                 # Extract the source if available
+                 source = (
+                     row.get(self.source_column, None)
+                     if self.source_column is not None
+                     else self.file_path
+                 )
+                 metadata = {"source": source, "row": i}
+
+                 for col in self.metadata_columns:
+                     if col in row:
+                         metadata[col] = row[col]
+
+                 doc = Document(page_content=content, metadata=metadata)
+                 docs.append(doc)
+             else:
+                 raise ValueError(f"Column '{self.columns_to_read[0]}' not found in CSV file.")
+
+         return docs
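A minimal usage sketch of the loader above (the CSV path and column names here are hypothetical; the first entry of columns_to_read becomes page_content, and metadata_columns are copied into each Document's metadata):

```python
loader = FilteredCSVLoader(
    file_path="data/faq.csv",        # hypothetical example file
    columns_to_read=["question"],    # column used as page_content
    metadata_columns=["category"],   # extra columns kept as metadata
)
docs = loader.load()
print(docs[0].page_content, docs[0].metadata)
```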
document_loaders/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .mypdfloader import RapidOCRPDFLoader
+ from .myimgloader import RapidOCRLoader
+ from .mydocloader import RapidOCRDocLoader
+ from .mypptloader import RapidOCRPPTLoader
document_loaders/mydocloader.py ADDED
@@ -0,0 +1,71 @@
+ from langchain.document_loaders.unstructured import UnstructuredFileLoader
+ from typing import List
+ import tqdm
+
+
+ class RapidOCRDocLoader(UnstructuredFileLoader):
+     def _get_elements(self) -> List:
+         def doc2text(filepath):
+             from docx.table import _Cell, Table
+             from docx.oxml.table import CT_Tbl
+             from docx.oxml.text.paragraph import CT_P
+             from docx.text.paragraph import Paragraph
+             from docx import Document, ImagePart
+             from PIL import Image
+             from io import BytesIO
+             import numpy as np
+             from rapidocr_onnxruntime import RapidOCR
+             ocr = RapidOCR()
+             doc = Document(filepath)
+             resp = ""
+
+             def iter_block_items(parent):
+                 from docx.document import Document
+                 if isinstance(parent, Document):
+                     parent_elm = parent.element.body
+                 elif isinstance(parent, _Cell):
+                     parent_elm = parent._tc
+                 else:
+                     raise ValueError("RapidOCRDocLoader parse fail")
+
+                 for child in parent_elm.iterchildren():
+                     if isinstance(child, CT_P):
+                         yield Paragraph(child, parent)
+                     elif isinstance(child, CT_Tbl):
+                         yield Table(child, parent)
+
+             b_unit = tqdm.tqdm(total=len(doc.paragraphs) + len(doc.tables),
+                                desc="RapidOCRDocLoader block index: 0")
+             for i, block in enumerate(iter_block_items(doc)):
+                 b_unit.set_description(
+                     "RapidOCRDocLoader block index: {}".format(i))
+                 b_unit.refresh()
+                 if isinstance(block, Paragraph):
+                     resp += block.text.strip() + "\n"
+                     images = block._element.xpath('.//pic:pic')  # collect all images in the paragraph
+                     for image in images:
+                         for img_id in image.xpath('.//a:blip/@r:embed'):  # get the image ids
+                             part = doc.part.related_parts[img_id]  # look up the image part by its id
+                             if isinstance(part, ImagePart):
+                                 image = Image.open(BytesIO(part._blob))
+                                 result, _ = ocr(np.array(image))
+                                 if result:
+                                     ocr_result = [line[1] for line in result]
+                                     resp += "\n".join(ocr_result)
+                 elif isinstance(block, Table):
+                     for row in block.rows:
+                         for cell in row.cells:
+                             for paragraph in cell.paragraphs:
+                                 resp += paragraph.text.strip() + "\n"
+                 b_unit.update(1)
+             return resp
+
+         text = doc2text(self.file_path)
+         from unstructured.partition.text import partition_text
+         return partition_text(text=text, **self.unstructured_kwargs)
+
+
+ if __name__ == '__main__':
+     loader = RapidOCRDocLoader(file_path="../tests/samples/ocr_test.docx")
+     docs = loader.load()
+     print(docs)
document_loaders/myimgloader.py ADDED
@@ -0,0 +1,25 @@
+ from typing import List
+ from langchain.document_loaders.unstructured import UnstructuredFileLoader
+ from document_loaders.ocr import get_ocr
+
+
+ class RapidOCRLoader(UnstructuredFileLoader):
+     def _get_elements(self) -> List:
+         def img2text(filepath):
+             resp = ""
+             ocr = get_ocr()
+             result, _ = ocr(filepath)
+             if result:
+                 ocr_result = [line[1] for line in result]
+                 resp += "\n".join(ocr_result)
+             return resp
+
+         text = img2text(self.file_path)
+         from unstructured.partition.text import partition_text
+         return partition_text(text=text, **self.unstructured_kwargs)
+
+
+ if __name__ == "__main__":
+     loader = RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg")
+     docs = loader.load()
+     print(docs)
document_loaders/mypdfloader.py ADDED
@@ -0,0 +1,51 @@
+ from typing import List
+ from langchain.document_loaders.unstructured import UnstructuredFileLoader
+ from configs import PDF_OCR_THRESHOLD
+ from document_loaders.ocr import get_ocr
+ import tqdm
+
+
+ class RapidOCRPDFLoader(UnstructuredFileLoader):
+     def _get_elements(self) -> List:
+         def pdf2text(filepath):
+             import fitz  # the fitz package bundled with PyMuPDF; not to be confused with `pip install fitz`
+             import numpy as np
+             ocr = get_ocr()
+             doc = fitz.open(filepath)
+             resp = ""
+
+             b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
+             for i, page in enumerate(doc):
+                 b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
+                 b_unit.refresh()
+                 text = page.get_text("")
+                 resp += text + "\n"
+
+                 img_list = page.get_image_info(xrefs=True)
+                 for img in img_list:
+                     if xref := img.get("xref"):
+                         bbox = img["bbox"]
+                         # skip images whose size falls below the configured page-ratio thresholds
+                         if ((bbox[2] - bbox[0]) / (page.rect.width) < PDF_OCR_THRESHOLD[0]
+                                 or (bbox[3] - bbox[1]) / (page.rect.height) < PDF_OCR_THRESHOLD[1]):
+                             continue
+                         pix = fitz.Pixmap(doc, xref)
+                         img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
+                         result, _ = ocr(img_array)
+                         if result:
+                             ocr_result = [line[1] for line in result]
+                             resp += "\n".join(ocr_result)
+
+                 # update progress
+                 b_unit.update(1)
+             return resp
+
+         text = pdf2text(self.file_path)
+         from unstructured.partition.text import partition_text
+         return partition_text(text=text, **self.unstructured_kwargs)
+
+
+ if __name__ == "__main__":
+     loader = RapidOCRPDFLoader(file_path="../tests/samples/ocr_test.pdf")
+     docs = loader.load()
+     print(docs)
document_loaders/mypptloader.py ADDED
@@ -0,0 +1,59 @@
+ from langchain.document_loaders.unstructured import UnstructuredFileLoader
+ from typing import List
+ import tqdm
+
+
+ class RapidOCRPPTLoader(UnstructuredFileLoader):
+     def _get_elements(self) -> List:
+         def ppt2text(filepath):
+             from pptx import Presentation
+             from PIL import Image
+             import numpy as np
+             from io import BytesIO
+             from rapidocr_onnxruntime import RapidOCR
+             ocr = RapidOCR()
+             prs = Presentation(filepath)
+             resp = ""
+
+             def extract_text(shape):
+                 nonlocal resp
+                 if shape.has_text_frame:
+                     resp += shape.text.strip() + "\n"
+                 if shape.has_table:
+                     for row in shape.table.rows:
+                         for cell in row.cells:
+                             for paragraph in cell.text_frame.paragraphs:
+                                 resp += paragraph.text.strip() + "\n"
+                 if shape.shape_type == 13:  # 13 means a picture
+                     image = Image.open(BytesIO(shape.image.blob))
+                     result, _ = ocr(np.array(image))
+                     if result:
+                         ocr_result = [line[1] for line in result]
+                         resp += "\n".join(ocr_result)
+                 elif shape.shape_type == 6:  # 6 means a group shape
+                     for child_shape in shape.shapes:
+                         extract_text(child_shape)
+
+             b_unit = tqdm.tqdm(total=len(prs.slides),
+                                desc="RapidOCRPPTLoader slide index: 1")
+             # iterate over all slides
+             for slide_number, slide in enumerate(prs.slides, start=1):
+                 b_unit.set_description(
+                     "RapidOCRPPTLoader slide index: {}".format(slide_number))
+                 b_unit.refresh()
+                 sorted_shapes = sorted(slide.shapes,
+                                        key=lambda x: (x.top, x.left))  # traverse top-to-bottom, left-to-right
+                 for shape in sorted_shapes:
+                     extract_text(shape)
+                 b_unit.update(1)
+             return resp
+
+         text = ppt2text(self.file_path)
+         from unstructured.partition.text import partition_text
+         return partition_text(text=text, **self.unstructured_kwargs)
+
+
+ if __name__ == '__main__':
+     loader = RapidOCRPPTLoader(file_path="../tests/samples/ocr_test.pptx")
+     docs = loader.load()
+     print(docs)
document_loaders/ocr.py ADDED
@@ -0,0 +1,18 @@
+ from typing import TYPE_CHECKING
+
+
+ if TYPE_CHECKING:
+     try:
+         from rapidocr_paddle import RapidOCR
+     except ImportError:
+         from rapidocr_onnxruntime import RapidOCR
+
+
+ def get_ocr(use_cuda: bool = True) -> "RapidOCR":
+     try:
+         from rapidocr_paddle import RapidOCR
+         ocr = RapidOCR(det_use_cuda=use_cuda, cls_use_cuda=use_cuda, rec_use_cuda=use_cuda)
+     except ImportError:
+         from rapidocr_onnxruntime import RapidOCR
+         ocr = RapidOCR()
+     return ocr
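get_ocr prefers the rapidocr_paddle backend when it is installed and falls back to rapidocr_onnxruntime otherwise. A minimal usage sketch, mirroring how the loaders above consume the result (the image path is hypothetical):

```python
# result is a list of (box, text, score) entries, or None when nothing is recognized.
ocr = get_ocr(use_cuda=False)
result, _ = ocr("tests/samples/ocr_test.jpg")
if result:
    print("\n".join(line[1] for line in result))
```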
embeddings/__init__.py ADDED
File without changes
embeddings/add_embedding_keywords.py ADDED
@@ -0,0 +1,79 @@
+ '''
+ This utility adds keywords to the embedding model so that keyword embeddings can be produced directly by the model.
+ It is implemented by modifying the embedding model's tokenizer.
+ It only applies to the model selected by the EMBEDDING_MODEL parameter; the merged model is saved next to the original one.
+ Thanks to @CharlesJu1 and @charlesyju for contributing the idea and the initial PR.
+
+ The saved model is placed in the original embedding model's directory, named <original model name>_Merge_Keywords_<timestamp>.
+ '''
+ import sys
+
+ sys.path.append("..")
+ import os
+ import torch
+
+ from datetime import datetime
+ from configs import (
+     MODEL_PATH,
+     EMBEDDING_MODEL,
+     EMBEDDING_KEYWORD_FILE,
+ )
+
+ from safetensors.torch import save_model
+ from sentence_transformers import SentenceTransformer
+ from langchain_core._api import deprecated
+
+
+ @deprecated(
+     since="0.3.0",
+     message="自定义关键词 Langchain-Chatchat 0.3.x 重写, 0.2.x中相关功能将废弃",
+     removal="0.3.0"
+ )
+ def get_keyword_embedding(bert_model, tokenizer, key_words):
+     tokenizer_output = tokenizer(key_words, return_tensors="pt", padding=True, truncation=True)
+     input_ids = tokenizer_output['input_ids']
+     input_ids = input_ids[:, 1:-1]
+
+     keyword_embedding = bert_model.embeddings.word_embeddings(input_ids)
+     keyword_embedding = torch.mean(keyword_embedding, 1)
+     return keyword_embedding
+
+
+ def add_keyword_to_model(model_name=EMBEDDING_MODEL, keyword_file: str = "", output_model_path: str = None):
+     key_words = []
+     with open(keyword_file, "r") as f:
+         for line in f:
+             key_words.append(line.strip())
+
+     st_model = SentenceTransformer(model_name)
+     key_words_len = len(key_words)
+     word_embedding_model = st_model._first_module()
+     bert_model = word_embedding_model.auto_model
+     tokenizer = word_embedding_model.tokenizer
+     key_words_embedding = get_keyword_embedding(bert_model, tokenizer, key_words)
+
+     embedding_weight = bert_model.embeddings.word_embeddings.weight
+     embedding_weight_len = len(embedding_weight)
+     tokenizer.add_tokens(key_words)
+     bert_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
+     embedding_weight = bert_model.embeddings.word_embeddings.weight
+     with torch.no_grad():
+         embedding_weight[embedding_weight_len:embedding_weight_len + key_words_len, :] = key_words_embedding
+
+     if output_model_path:
+         os.makedirs(output_model_path, exist_ok=True)
+         word_embedding_model.save(output_model_path)
+         safetensors_file = os.path.join(output_model_path, "model.safetensors")
+         metadata = {'format': 'pt'}
+         save_model(bert_model, safetensors_file, metadata)
+         print("save model to {}".format(output_model_path))
+
+
+ def add_keyword_to_embedding_model(path: str = EMBEDDING_KEYWORD_FILE):
+     keyword_file = os.path.join(path)
+     model_name = MODEL_PATH["embed_model"][EMBEDDING_MODEL]
+     model_parent_directory = os.path.dirname(model_name)
+     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+     output_model_name = "{}_Merge_Keywords_{}".format(EMBEDDING_MODEL, current_time)
+     output_model_path = os.path.join(model_parent_directory, output_model_name)
+     add_keyword_to_model(model_name, keyword_file, output_model_path)
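A minimal invocation sketch, assuming EMBEDDING_KEYWORD_FILE resolves to the embedding_keywords.txt shipped below and EMBEDDING_MODEL points at a locally stored model (the merged copy is written next to the original model directory):

```python
# Merge the keywords from the keyword file into the configured embedding model.
if __name__ == "__main__":
    add_keyword_to_embedding_model(EMBEDDING_KEYWORD_FILE)
```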
embeddings/embedding_keywords.txt ADDED
@@ -0,0 +1,3 @@
+ Langchain-Chatchat
+ 数据科学与大数据技术
+ 人工智能与先进计算
img/LLM_success.png ADDED
img/agent_continue.png ADDED
img/agent_success.png ADDED
img/chatchat-qrcode.jpg ADDED
img/chatchat_icon_blue_square_v2.png ADDED
img/docker_logs.png ADDED
img/fastapi_docs_026.png ADDED
img/init_knowledge_base.jpg ADDED
img/knowledge_base_success.jpg ADDED
img/langchain+chatglm.png ADDED

Git LFS Details

  • SHA256: 9ae4af8281129ba13033d172ce0556baf2c5f4b07f1bcf50ec233082266208b5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
img/langchain+chatglm2.png ADDED
img/logo-long-chatchat-trans-v2.png ADDED
img/official_account_qr.png ADDED
img/official_wechat_mp_account.png ADDED

Git LFS Details

  • SHA256: 021285c88e22bf0976c5188c5717466fa10af23ada09d4210ccf88bc8df7516c
  • Pointer size: 132 Bytes
  • Size of remote file: 4.27 MB
img/partners/autodl.svg ADDED
img/partners/aws.svg ADDED
img/partners/chatglm.svg ADDED
img/partners/zhenfund.svg ADDED
img/qr_code_86.jpg ADDED
img/qr_code_87.jpg ADDED
img/qr_code_88.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-124076-270516.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-20096-279847.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-220157-552735.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-36114-765327.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-392521-261326.jpg ADDED

Git LFS Details

  • SHA256: 434aeea6c4491658ff7f7555060f708bd326d0ecf6fa62d7ca261a6ec845817a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-42284-124759.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-57107-679259.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-618350-869132.jpg ADDED
knowledge_base/samples/content/llm/img/分布式训练技术原理-幕布图片-838373-426344.jpg ADDED