File size: 1,119 Bytes
7668644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pysqlite3
import sys, os
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Chroma
import streamlit as st

# Hugging Face API token, read from Streamlit's secrets store when this module
# is loaded; persist_dir() passes it to the embeddings client as ``api_key``.
HF_TOKEN = st.secrets["HF_TOKEN"]

def persist_dir(
    file_path: str,
    *,
    persist_directory: str = "./db",
    chunk_size: int = 1024,
    chunk_overlap: int = 150,
) -> "Chroma":
    """Load a PDF, split it into chunks, embed them and store them in Chroma.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        persist_directory: Directory given to Chroma as ``persist_directory``.
            Defaults to ``"./db"``, the location this function always used.
        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` for ``RecursiveCharacterTextSplitter``.

    Returns:
        The ``Chroma`` vector store created from the document chunks, so the
        caller can use it directly instead of re-opening the directory.
    """
    loader = PyPDFLoader(file_path)
    print("Loading data...")
    documents = loader.load()

    print("Splitting data...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(documents)

    # HF_TOKEN is the module-level token; embeddings are computed through the
    # Hugging Face Inference API with the BAAI/bge-base-en-v1.5 model.
    embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5"
    )

    print("Save to db...")
    return Chroma.from_documents(
        chunks, embeddings, persist_directory=persist_directory
    )
    
if __name__ == "__main__":
    # The input PDF is hard-coded for now; this will change if file upload is
    # added on the Streamlit side.
    # Alternative input used previously: "./data/Sungwon_Kim_ML_DL.pdf"
    persist_dir("./data/Sungwon_Kim_ML_DL_Intro_together.pdf")