"""Populate a Chroma document store with embedded wiki texts and a PDF chronicle.

Run as a script, this converts the chronicle PDF and the wiki ``.txt`` files
into Haystack documents, splits them into 300-word chunks, embeds them with a
SentenceTransformers model and writes them to a persistent Chroma store.
"""

import os
from functools import lru_cache
from pathlib import Path

from dotenv import load_dotenv
from haystack import Document
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentSplitter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.chroma import ChromaDocumentStore

# `dotenv_path` has to name the .env *file*; the previous value "../" was a
# directory, which python-dotenv silently ignores (nothing was loaded).
load_dotenv(dotenv_path="../.env")

# Embedding model shared by every write_* helper.
# NOTE(review): the E5 model card recommends "passage: " / "query: " input
# prefixes; none is configured here. Confirm what the query-side embedder
# does before adding a `prefix`, otherwise stored and query vectors diverge.
EMBEDDING_MODEL = "intfloat/multilingual-e5-small"


def read_text_files(directory):
    """Return the contents of every ``.txt`` file directly inside *directory*.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of strings, one per text file, ordered by file name.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    text_contents = []
    # os.listdir order is arbitrary; sort so repeated runs see the same order.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Skip other file types and directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(filepath):
            continue
        with open(filepath, "r", encoding="utf-8") as file:
            text_contents.append(file.read())
    return text_contents


def build_store(path="chromadb"):
    """Create a Chroma document store persisted at *path*.

    Args:
        path: Value passed to ``ChromaDocumentStore`` as ``persist_path``.

    Returns:
        The newly constructed ``ChromaDocumentStore``.
    """
    return ChromaDocumentStore(persist_path=path)


@lru_cache(maxsize=1)
def _get_doc_embedder():
    """Return a warmed-up document embedder, loading the model only once."""
    doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDING_MODEL)
    doc_embedder.warm_up()  # Download / load the model
    return doc_embedder


def _make_splitter():
    """Return the splitter shared by all sources: 300-word chunks, no overlap."""
    return DocumentSplitter(split_by="word", split_length=300, split_overlap=0)


def write_docs(docs, document_store):
    """Embed *docs* and write them to *document_store*.

    Documents that already exist in the store are overwritten
    (``DuplicatePolicy.OVERWRITE``).

    Args:
        docs: List of Haystack ``Document`` objects to embed.
        document_store: Store receiving the embedded documents.

    Returns:
        The same *document_store*, to allow chaining.
    """
    # The embedder is cached so that consecutive calls (chronicle, then wiki)
    # do not reload the model.
    docs_with_embeddings = _get_doc_embedder().run(docs)
    print("embedded")
    document_store.write_documents(
        docs_with_embeddings["documents"], policy=DuplicatePolicy.OVERWRITE
    )
    print("written")
    return document_store


def write_wiki(directory_path, document_store):
    """Split, embed and store every ``.txt`` file found in *directory_path*.

    Each document is tagged with ``meta={"source": "wiki"}``.

    Args:
        directory_path: Directory containing the wiki text files.
        document_store: Store receiving the embedded chunks.

    Returns:
        The same *document_store*.
    """
    texts = read_text_files(directory_path)
    docs = [Document(content=text, meta={"source": "wiki"}) for text in texts]
    docs = _make_splitter().run(docs)["documents"]
    return write_docs(docs, document_store)


def write_chronik(file_path, document_store):
    """Convert the PDF at *file_path*, then split, embed and store it.

    Each document is tagged with ``meta={"source": "chronik"}``.

    Args:
        file_path: Path of the chronicle PDF.
        document_store: Store receiving the embedded chunks.

    Returns:
        The same *document_store*.
    """
    converter = PyPDFToDocument()
    converted = converter.run(sources=[Path(file_path)], meta={"source": "chronik"})
    print("converted")
    docs = _make_splitter().run(converted["documents"])["documents"]
    # Report progress only after the split has actually happened.
    print("splitted")
    return write_docs(docs, document_store)


def main():
    """Build the store and ingest the chronicle PDF followed by the wiki texts."""
    doc_store = build_store("chromadb")
    doc_store = write_chronik("../processing/chronik/kreisbuch_emsland.pdf", doc_store)
    write_wiki("../processing/wiki/data", doc_store)
    print("Hey")


if __name__ == "__main__":
    main()