File size: 4,543 Bytes
b2e325f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

''' Start: core logic for loading the local knowledge base. '''
import langchain
import os
# from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
# # from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS

# ## Load a single file
# filepath = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
# filepath = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/大模型LLM解决方案调研问卷.pdf"
# filepath = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/gpt-index-readthedocs-io-en-latest.pdf"
# # # filepath = "/Users/yunshi/Downloads/txt_dir/浙江省院前急救质控统计指标.pdf"
# loader = UnstructuredFileLoader(filepath)
# docs = loader.load() ## NOTE: the multi-file code below names this variable `documents`, not `docs`.


# Defaults preserved from the original hard-coded values so existing callers
# get exactly the same model and output location.
_DEFAULT_EMBEDDING_MODEL_PATH = '/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/bge-large-zh/'
_DEFAULT_INDEX_DIR = './FAISS/'
_SUPPORTED_SUFFIXES = ('.pdf', '.docx', '.doc', '.txt')


def _list_supported_files(directory):
    """Return ``(path, suffix)`` pairs for the supported regular files in *directory*."""
    found = []
    # Sorted so the order in which documents are indexed is reproducible.
    for name in sorted(os.listdir(directory)):
        lowered = name.lower()
        suffix = next((s for s in _SUPPORTED_SUFFIXES if lowered.endswith(s)), None)
        if suffix is None:
            continue
        # os.path.join works with or without a trailing separator on *directory*.
        path = os.path.join(directory, name)
        # Skip sub-directories whose names merely look like documents.
        if os.path.isfile(path):
            found.append((path, suffix))
    return found


def localKB_construct(
    fileDirectory,
    *,
    embedding_model_path=_DEFAULT_EMBEDDING_MODEL_PATH,
    index_dir=_DEFAULT_INDEX_DIR,
    chunk_size=1000,
    chunk_overlap=20,
):
    """Build a FAISS vector store from the documents in a folder and save it.

    Every ``.pdf``, ``.docx``, ``.doc`` and ``.txt`` file directly inside
    *fileDirectory* (extension matched case-insensitively, sub-folders are
    not visited) is loaded, split into chunks, embedded and indexed.

    Args:
        fileDirectory: Folder containing the source documents. A trailing
            path separator is accepted but no longer required.
        embedding_model_path: Value passed as ``model_name`` to
            ``HuggingFaceEmbeddings``.
        index_dir: Folder passed to ``vector_store.save_local``.
        chunk_size: ``chunk_size`` given to ``CharacterTextSplitter``.
            NOTE: do not make this too large, or chunks may exceed the
            embedding model's maximum token length.
        chunk_overlap: ``chunk_overlap`` given to ``CharacterTextSplitter``.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        FileNotFoundError / NotADirectoryError: Propagated from
            ``os.listdir`` when *fileDirectory* is not a readable folder.
        ValueError: If the folder holds no supported file, or the files
            produce no text chunks to index.
    """
    source_files = _list_supported_files(fileDirectory)
    if not source_files:
        raise ValueError(
            f"no .pdf/.docx/.doc/.txt files found in {fileDirectory!r}"
        )

    # Imported only once there is something to load, so an empty or wrong
    # folder fails fast without pulling in the loader packages.
    from langchain_community.document_loaders import Docx2txtLoader
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_community.document_loaders import TextLoader

    # NOTE(review): '.doc' is routed to Docx2txtLoader exactly as before;
    # legacy binary .doc files may not be readable by it — confirm.
    loader_by_suffix = {
        '.pdf': PyPDFLoader,
        '.docx': Docx2txtLoader,
        '.doc': Docx2txtLoader,
        '.txt': TextLoader,
    }

    documents = []
    for path, suffix in source_files:
        documents.extend(loader_by_suffix[suffix](path).load())

    print("length of all documents:", len(documents))

    # Split the loaded documents into chunks for embedding.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = splitter.split_documents(documents)
    if not docs:
        # Guard explicitly: an empty chunk list cannot be turned into an index.
        raise ValueError(
            f"documents in {fileDirectory!r} produced no text chunks to index"
        )

    # Embed with the locally stored model (default: the BGE model folder).
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_path)

    # Build the vector store and persist it to disk.
    vector_store = FAISS.from_documents(docs, embeddings)
    vector_store.save_local(index_dir)
    print('vector_store construction complete:', vector_store)
    return vector_store

# vs = localKB_construct(fileDirectory = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/rawdata/PDF/")

if __name__ == '__main__':
    # Script entry point: build the local knowledge base from the folder below.
    # The folder path is written with a trailing "/".
    # An interactive alternative is to pass input(...) as the folder path.
    kb_source_dir = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/交付_简易知识库查询系统(含基座)/KB/"
    localKB_construct(kb_source_dir)