myconsicouness / dataloader.py
adityasugandhi
,,
919910a
raw
history blame
4.7 kB
from pathlib import Path
import os
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
class DataLoader:
    """Index the files under ``<cwd>/data`` into a Haystack document store.

    Two stores are created up front: a ``ChromaDocumentStore`` and an
    ``InMemoryDocumentStore``. ``dataloader`` fills the former and
    ``InMemory_dataloader`` fills the latter; both run the same indexing
    pipeline (route by MIME type -> convert -> join -> clean -> split ->
    embed -> write) and differ only in the store they write to.
    """

    # Name of the directory (relative to the current working directory)
    # whose files are indexed.
    _DATA_DIR_NAME = "data"
    # Sentence-transformers model used to embed the document chunks.
    _EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        """Create the two (initially unpopulated by this class) stores."""
        self.chroma_store = ChromaDocumentStore()
        self.InMemory_store = InMemoryDocumentStore()

    @staticmethod
    def _list_data_files(data_dir: Path) -> "list[str]":
        """Return the regular files directly inside *data_dir* as strings.

        Sub-directories are skipped, since only files can be converted, and
        the result is sorted so that repeated runs feed the pipeline in the
        same order (directory listing order is otherwise arbitrary).

        Raises:
            FileNotFoundError: if *data_dir* does not exist.
            NotADirectoryError: if *data_dir* is not a directory.
        """
        return [str(entry) for entry in sorted(data_dir.iterdir()) if entry.is_file()]

    @classmethod
    def _build_indexing_pipeline(cls, document_store) -> "Pipeline":
        """Build the indexing pipeline whose final step writes to *document_store*."""
        pipeline = Pipeline()

        # Only plain-text and PDF sources are routed to a converter.
        pipeline.add_component(
            "FileTypeRouter",
            FileTypeRouter(mime_types=["text/plain", "application/pdf"]),
        )
        pipeline.add_component("TextFileConverter", TextFileToDocument())
        pipeline.add_component("PdfFileConverter", PyPDFToDocument())
        pipeline.add_component("Joiner", DocumentJoiner())
        pipeline.add_component("Cleaner", DocumentCleaner())
        pipeline.add_component(
            "Splitter",
            DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30),
        )
        pipeline.add_component(
            "Embedder",
            SentenceTransformersDocumentEmbedder(model=cls._EMBEDDING_MODEL),
        )
        pipeline.add_component("Writer", DocumentWriter(document_store=document_store))

        # Each MIME type goes to its converter; both converters feed the joiner.
        pipeline.connect("FileTypeRouter.text/plain", "TextFileConverter.sources")
        pipeline.connect("FileTypeRouter.application/pdf", "PdfFileConverter.sources")
        pipeline.connect("TextFileConverter.documents", "Joiner.documents")
        pipeline.connect("PdfFileConverter.documents", "Joiner.documents")
        # Linear tail: join -> clean -> split -> embed -> write.
        pipeline.connect("Joiner.documents", "Cleaner.documents")
        pipeline.connect("Cleaner.documents", "Splitter.documents")
        pipeline.connect("Splitter.documents", "Embedder.documents")
        pipeline.connect("Embedder.documents", "Writer.documents")
        return pipeline

    def _index_into(self, document_store):
        """Run the indexing pipeline over ``<cwd>/data`` and return *document_store*."""
        data_path = Path.cwd() / self._DATA_DIR_NAME
        file_paths = self._list_data_files(data_path)
        pipeline = self._build_indexing_pipeline(document_store)
        pipeline.run(
            {"FileTypeRouter": {"sources": file_paths}},
        )
        return document_store

    def dataloader(self):
        """Index ``<cwd>/data`` into the Chroma store and return that store.

        Raises:
            FileNotFoundError: if the ``data`` directory does not exist.
        """
        return self._index_into(self.chroma_store)

    def InMemory_dataloader(self):
        """Index ``<cwd>/data`` into the in-memory store and return that store.

        Raises:
            FileNotFoundError: if the ``data`` directory does not exist.
        """
        return self._index_into(self.InMemory_store)

    def get_chroma_store(self):
        """Return the Chroma document store held by this loader."""
        return self.chroma_store

    def get_InMemory_store(self):
        """Return the in-memory document store held by this loader."""
        return self.InMemory_store