# llm_knowledge_base/langchain_KB_construct.py
''' Start: core logic below for building the local knowledge base. '''
import os

from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings  # alternative: OpenAI embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# ## Loading a single file (alternative to the directory walk below):
# from langchain_community.document_loaders import UnstructuredFileLoader
# filepath = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
# loader = UnstructuredFileLoader(filepath)
# docs = loader.load()  # NOTE: the multi-file code below uses the variable name `documents`, not `docs`.
def localKB_construct(fileDirectory):
    """Build a local FAISS vector store from all PDF/DOCX/TXT files in fileDirectory."""
    ### Load every supported document in the directory.
    # e.g. fileDirectory = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/rawdata/PDF/"
    from langchain_community.document_loaders import Docx2txtLoader
    from langchain_community.document_loaders import TextLoader
    from langchain_community.document_loaders import PyPDFLoader

    documents = []
    for file in os.listdir(fileDirectory):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(fileDirectory, file)  # os.path.join removes the need for a trailing "/"
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
        elif file.endswith('.docx') or file.endswith('.doc'):
            # NOTE: Docx2txtLoader only parses the zip-based .docx format; legacy .doc files may fail.
            doc_path = os.path.join(fileDirectory, file)
            loader = Docx2txtLoader(doc_path)
            documents.extend(loader.load())
        elif file.endswith('.txt'):
            text_path = os.path.join(fileDirectory, file)
            loader = TextLoader(text_path)
            documents.extend(loader.load())
print("length of all documents:", len(documents))
# documents = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# chunked_documents = text_splitter.split_documents(documents)
# ## 文本分割
# text_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=200)
docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20).split_documents(documents) #NOTE: 不易chunk_size切割太大,超过模型的最大长度max tokens。
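    # Added sketch (not in the original): a quick sanity check that chunking stayed
    # within bounds. Character count is only a rough proxy for token count, and the
    # 512-token limit assumed here is the typical max sequence length of bge-large-zh.
    if docs:
        print("longest chunk (chars):", max(len(d.page_content) for d in docs))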
    ## Build the vector store.
    # embeddings = OpenAIEmbeddings(disallowed_special=())
    # embeddings = HuggingFaceEmbeddings(model_name='GanymedeNil/text2vec-large-chinese')  # online: pulls the model from Hugging Face.
    embeddings = HuggingFaceEmbeddings(model_name='/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/bge-large-zh/')  # switched to the BGE embedding model.
    # embeddings = HuggingFaceEmbeddings(model_name='/Users/yunshi/Downloads/chatGLM/My_LocalKB_Project/GanymedeNil_text2vec-large-chinese/')  # emits a "No sentence-transformers model found with name" warning; it is not an error and does not affect use.
    ### Another Chinese embedding option:
    # from text2vec import SentenceModel
    # embeddings = SentenceModel('shibing624/text2vec-base-chinese-sentence', device=mps_device)
    vector_store = FAISS.from_documents(docs, embeddings)
    vector_store.save_local('./FAISS/')
    print('vector_store construction complete:', vector_store)
    return vector_store
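
# A companion sketch (not in the original): reload the index saved by
# localKB_construct() and run a similarity search against it. The model id
# 'BAAI/bge-large-zh' is an assumption standing in for the local BGE path above;
# allow_dangerous_deserialization=True is required by recent langchain_community
# versions because the FAISS docstore is pickled on disk.
def localKB_query(query, index_path='./FAISS/', embedding_model='BAAI/bge-large-zh', k=3):
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vector_store = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    for doc in vector_store.similarity_search(query, k=k):
        print(doc.metadata.get('source'), '->', doc.page_content[:100])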
# vs = localKB_construct(fileDirectory="/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/rawdata/PDF/")
if __name__ == '__main__':
    # localKB_construct(input("Enter the local folder path: "))  # with os.path.join above, a trailing "/" is no longer required.
    localKB_construct("/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/交付_简易知识库查询系统(含基座)/KB/")  ### local folder path used to build the knowledge base.