File size: 4,704 Bytes
919910a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from pathlib import Path
import os
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
class DataLoader:
    """Index the files found in ``<cwd>/data`` into a document store.

    Two stores are created per instance: a ``ChromaDocumentStore`` and an
    ``InMemoryDocumentStore``. ``dataloader`` fills the former and
    ``InMemory_dataloader`` fills the latter, both through the same indexing
    pipeline (route by MIME type -> convert -> join -> clean -> split ->
    embed -> write).
    """

    # Directory, relative to the current working directory, that is scanned.
    _DATA_DIR_NAME = "data"
    # MIME types the router forwards to a converter.
    _MIME_TYPES = ("text/plain", "application/pdf")
    # Sentence-transformers model used to embed the document chunks.
    _EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        """Create the Chroma and in-memory stores with their default settings."""
        self.chroma_store = ChromaDocumentStore()
        self.InMemory_store = InMemoryDocumentStore()

    def dataloader(self) -> "ChromaDocumentStore":
        """Index the data directory into the Chroma store and return that store.

        Raises:
            FileNotFoundError: if ``<cwd>/data`` does not exist.
        """
        return self._index_into(self.chroma_store)

    def InMemory_dataloader(self) -> "InMemoryDocumentStore":
        """Index the data directory into the in-memory store and return that store.

        Raises:
            FileNotFoundError: if ``<cwd>/data`` does not exist.
        """
        return self._index_into(self.InMemory_store)

    def get_chroma_store(self) -> "ChromaDocumentStore":
        """Return the Chroma document store owned by this loader."""
        return self.chroma_store

    def get_InMemory_store(self) -> "InMemoryDocumentStore":
        """Return the in-memory document store owned by this loader."""
        return self.InMemory_store

    def _list_data_files(self) -> "list[str]":
        """Return the sorted paths of the regular files directly inside the data directory."""
        data_path = Path.cwd() / self._DATA_DIR_NAME
        # Skip subdirectories (they are not documents) and sort so that the
        # indexing order does not depend on the order the OS lists entries in.
        return sorted(str(entry) for entry in data_path.iterdir() if entry.is_file())

    def _build_indexing_pipeline(self, document_store):
        """Assemble the indexing pipeline whose final writer targets ``document_store``."""
        pipeline = Pipeline()

        components = (
            ("FileTypeRouter", FileTypeRouter(mime_types=list(self._MIME_TYPES))),
            ("TextFileConverter", TextFileToDocument()),
            ("PdfFileConverter", PyPDFToDocument()),
            ("Joiner", DocumentJoiner()),
            ("Cleaner", DocumentCleaner()),
            (
                "Splitter",
                DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30),
            ),
            ("Embedder", SentenceTransformersDocumentEmbedder(model=self._EMBEDDING_MODEL)),
            # NOTE(review): the writer runs with its default duplicate policy;
            # indexing the same files twice into one store may be rejected or
            # duplicated depending on the store - confirm the desired policy.
            ("Writer", DocumentWriter(document_store=document_store)),
        )
        for name, component in components:
            pipeline.add_component(name, component)

        connections = (
            ("FileTypeRouter.text/plain", "TextFileConverter.sources"),
            ("FileTypeRouter.application/pdf", "PdfFileConverter.sources"),
            ("TextFileConverter.documents", "Joiner.documents"),
            ("PdfFileConverter.documents", "Joiner.documents"),
            ("Joiner.documents", "Cleaner.documents"),
            ("Cleaner.documents", "Splitter.documents"),
            ("Splitter.documents", "Embedder.documents"),
            ("Embedder.documents", "Writer.documents"),
        )
        for sender, receiver in connections:
            pipeline.connect(sender, receiver)

        return pipeline

    def _index_into(self, document_store):
        """Run the indexing pipeline over the data files and return ``document_store``."""
        file_paths = self._list_data_files()
        if not file_paths:
            # Nothing to index: skip building the pipeline altogether.
            return document_store

        pipeline = self._build_indexing_pipeline(document_store)
        pipeline.run({"FileTypeRouter": {"sources": file_paths}})
        return document_store