Spaces:
Running
Running
File size: 1,119 Bytes
7668644 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# Re-register the already-imported `pysqlite3` module under the name "sqlite3"
# in sys.modules, so every later `import sqlite3` in this process resolves to
# pysqlite3 instead of the standard-library module.
# Keep this ahead of the imports below: a module that imports sqlite3 at import
# time only sees the replacement if it has been installed first.
# NOTE(review): presumably needed because the host's bundled SQLite is older
# than what Chroma requires — confirm against the deployment environment.
import pysqlite3
import sys, os
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Chroma
import streamlit as st
# Hugging Face API token, looked up in Streamlit's secrets (`st.secrets`) when
# this module is imported; passed as `api_key` to the embeddings client below.
HF_TOKEN = st.secrets["HF_TOKEN"]
def persist_dir(
    file_path: str,
    *,
    persist_directory: str = "./db",
    chunk_size: int = 1024,
    chunk_overlap: int = 150,
    model_name: str = "BAAI/bge-base-en-v1.5",
):
    """Index a PDF into a Chroma vector store persisted on disk.

    The PDF is loaded with ``PyPDFLoader``, split into overlapping chunks with
    ``RecursiveCharacterTextSplitter``, embedded through
    ``HuggingFaceInferenceAPIEmbeddings`` (authenticated with the module-level
    ``HF_TOKEN``) and handed to ``Chroma.from_documents``.

    Args:
        file_path: Path of the PDF file to index.
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        model_name: Embedding model requested from the Hugging Face
            Inference API.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
        Earlier versions of this function discarded it and returned ``None``;
        callers that ignore the return value are unaffected.
    """
    loader = PyPDFLoader(file_path)
    print("Loading data...")
    documents = loader.load()

    print("Splitting data...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(documents)

    embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=HF_TOKEN,
        model_name=model_name,
    )

    print("Save to db...")
    # NOTE(review): some older langchain/chromadb combinations reportedly need
    # an explicit `.persist()` call to flush to disk — confirm for the pinned
    # versions before relying on the on-disk store.
    return Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=persist_directory,
    )
if __name__ == "__main__":
    # The input PDF is hard-coded for now; this is expected to change once
    # file upload is added to the Streamlit app.
    # Earlier sample input: "./data/Sungwon_Kim_ML_DL.pdf"
    sample_pdf = "./data/Sungwon_Kim_ML_DL_Intro_together.pdf"
    persist_dir(sample_pdf)
|