File size: 3,809 Bytes
39b06ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eaa707f
 
 
39b06ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from langchain_community.document_loaders import PyMuPDFLoader

def load_pdfs(paths: list) -> list:
    """Load every PDF listed in ``paths`` and return the documents as one flat list.

    Each path is handed to ``PyMuPDFLoader`` and the result of its ``load()``
    call is appended, in input order, to a single list.

    Args:
        paths: File paths of the PDFs to load.

    Returns:
        All documents produced by the loaders; an empty list when ``paths``
        is empty.
    """
    return [
        document
        for pdf_path in paths
        for document in PyMuPDFLoader(pdf_path).load()
    ]

#####

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_docs_recursive(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    """Split ``documents`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split; passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` call returns.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

#####

from langchain.text_splitter import NLTKTextSplitter
import nltk

# Import-time side effect: asks NLTK to download the 'punkt_tab' resource
# every time this module is imported.
# NOTE(review): presumably required by NLTKTextSplitter's sentence
# tokenization used in chunk_docs_nltk — confirm.
nltk.download('punkt_tab')

def chunk_docs_nltk(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    """Split ``documents`` into chunks with an ``NLTKTextSplitter``.

    Args:
        documents: Documents to split; passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` call returns.
    """
    splitter = NLTKTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

#####

# from langchain_openai import OpenAIEmbeddings

# def create_embeddings_openai(model: str) -> OpenAIEmbeddings:

#     # Initialize the OpenAIEmbeddings class
#     embeddings = OpenAIEmbeddings(model=model)

#     return embeddings

#####

from langchain_huggingface import HuggingFaceEmbeddings

def create_embeddings_opensource(model: str) -> HuggingFaceEmbeddings:
    """Build a ``HuggingFaceEmbeddings`` instance for the given model.

    Args:
        model: Passed to ``HuggingFaceEmbeddings`` as ``model_name``.

    Returns:
        The newly constructed embeddings object.
    """
    return HuggingFaceEmbeddings(model_name=model)

#####

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def create_vector_store(location: str, collection_name: str, vector_size: int, embeddings, documents: list) -> QdrantVectorStore:
    """Create a Qdrant collection, wrap it in a vector store and index ``documents``.

    Args:
        location: Passed to ``QdrantClient`` as ``location``.
        collection_name: Name of the collection to create and then use.
        vector_size: Used as the ``size`` of the collection's ``VectorParams``.
            NOTE(review): presumably must match the dimension produced by
            ``embeddings`` — confirm.
        embeddings: Embedding object handed to ``QdrantVectorStore``.
        documents: Documents added to the store via ``add_documents``.

    Returns:
        The ``QdrantVectorStore`` after ``documents`` have been added.
    """
    client = QdrantClient(location=location)

    # The collection is created unconditionally, using cosine distance.
    vectors_config = VectorParams(size=vector_size, distance=Distance.COSINE)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )

    # Wrap the freshly created collection and index the documents into it.
    store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    store.add_documents(documents)

    return store

#####

def create_retriever_from_qdrant(vector_store: QdrantVectorStore):
    """Return the retriever obtained from ``vector_store.as_retriever()``.

    Args:
        vector_store: The vector store to expose as a retriever.

    Returns:
        The result of calling ``as_retriever()`` with no arguments.
    """
    return vector_store.as_retriever()

#####

from langchain.prompts import ChatPromptTemplate

def create_chat_prompt_template() -> ChatPromptTemplate:
    """Build the chat prompt used for context-restricted question answering.

    The template has two placeholders, ``{question}`` and ``{context}``, and
    instructs the model to answer only from the context or reply
    "I don't know".

    Returns:
        A ``ChatPromptTemplate`` created from the template text.
    """
    prompt_text = """
    Only answer the question using the context below.  If the answer can't be found in the context, respond "I don't know". 

    Question:
    {question}

    Context:
    {context}
    """
    return ChatPromptTemplate.from_template(prompt_text)

#####

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from operator import itemgetter

def create_chain_openai(model: str, prompt: ChatPromptTemplate, retriever):
    """Assemble a retrieval-augmented chain backed by an OpenAI chat model.

    The chain takes a mapping with a ``"question"`` key. It pipes the question
    into ``retriever`` to obtain the context, feeds question and context
    through ``prompt`` into the LLM, and returns both the LLM output and the
    retrieved context.

    Args:
        model: Name of the OpenAI chat model to use. This argument used to be
            ignored in favour of a hard-coded ``"gpt-4o-mini"``; it is now
            honoured. A falsy value (``None`` or ``""``) falls back to
            ``"gpt-4o-mini"`` so callers that relied on the old behaviour
            keep working.
        prompt: Prompt template piped into the LLM; receives the
            ``"question"`` and ``"context"`` keys.
        retriever: Runnable piped after ``itemgetter("question")`` to produce
            the context.

    Returns:
        A runnable whose output is a mapping with keys ``"response"`` (the
        result of ``prompt | llm``) and ``"context"`` (the retrieved context).
    """
    llm = ChatOpenAI(
        # Use the caller's model instead of silently overriding it.
        model_name=model or "gpt-4o-mini",
        temperature=0,
    )

    chain = (
        # Step 1: retrieve context for the question, keep the question itself.
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question"),
        }
        # Step 2: re-assigns "context" to itself; kept unchanged from the
        # original pipeline.
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # Step 3: generate the answer and pass the context through alongside it.
        | {
            "response": prompt | llm,
            "context": itemgetter("context"),
        }
    )

    return chain

#####