# Shared configuration constants: directory layout, Chroma settings, the
# file-extension -> document-loader table, and the embedding model name.
import os

# Optional .env support is currently disabled; uncomment this import and the
# load_dotenv() call below to enable it.
# from dotenv import load_dotenv
from chromadb.config import Settings

# Excel loader reference:
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import (
    CSVLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredExcelLoader,
    Docx2txtLoader,
)

# load_dotenv()

# Directory containing this file; symlinks are resolved by realpath().
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Folders located next to this file: SOURCE_DOCUMENTS (presumably the input
# documents to ingest -- confirm against the ingest script) and DB, which is
# handed to Chroma as its persist directory.
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"

# Number of ingest threads: the reported CPU count, or 8 when os.cpu_count()
# returns None. Can be replaced with a specific number.
INGEST_THREADS = os.cpu_count() or 8

# Chroma settings: duckdb+parquet implementation, persisted under
# PERSIST_DIRECTORY, with anonymized telemetry switched off.
CHROMA_SETTINGS = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False,
)

# File extension -> loader class. Extensions sharing a loader are grouped;
# the resulting key order matches the original flat literal.
# UnstructuredExcelLoader source:
# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
# NOTE(review): ".doc" is routed to Docx2txtLoader just like ".docx" --
# confirm that loader can actually read legacy .doc files.
DOCUMENT_MAP = {
    **dict.fromkeys((".txt", ".md", ".py"), TextLoader),
    ".pdf": PDFMinerLoader,
    ".csv": CSVLoader,
    **dict.fromkeys((".xls", ".xlsx"), UnstructuredExcelLoader),
    **dict.fromkeys((".docx", ".doc"), Docx2txtLoader),
}

# Default Instructor model used for embeddings.
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"

# A smaller model can be chosen instead; if you do, remember to change
# HuggingFaceInstructEmbeddings to HuggingFaceEmbeddings in both ingest.py
# and run_localGPT.py.
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"