Update handler.py

handler.py CHANGED: +40 -63
@@ -2,7 +2,7 @@ import torch
 import locale
 import os
 from typing import Dict, List, Any
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
 from langchain.llms import HuggingFacePipeline
 from langchain.retrievers.document_compressors import LLMChainExtractor
 from langchain.retrievers import ContextualCompressionRetriever
@@ -26,46 +26,30 @@ from langchain.schema import format_document
 from langchain.memory import ConversationBufferMemory
 from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
 from langchain_core.runnables import RunnableParallel
-
+from typing import Optional
+from langchain.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores.utils import DistanceStrategy
+
+EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
+MARKDOWN_SEPARATORS = [
+    "\n#{1,6} ",
+    "```\n",
+    "\n\\*\\*\\*+\n",
+    "\n---+\n",
+    "\n___+\n",
+    "\n\n",
+    "\n",
+    " ",
+    "",
+]
+
 class EndpointHandler():
-    def split_documents(
-        chunk_size: int,
-        knowledge_base: [],
-        tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
-    ):
-        """
-        Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
-        """
-        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-            AutoTokenizer.from_pretrained(tokenizer_name),
-            chunk_size=chunk_size,
-            chunk_overlap=int(chunk_size / 10),
-            add_start_index=True,
-            strip_whitespace=True,
-            separators=MARKDOWN_SEPARATORS,
-        )
-
-        docs_processed = []
-        for doc in knowledge_base:
-            docs_processed += text_splitter.split_documents([doc])
-
-        # Remove duplicates
-        unique_texts = {}
-        docs_processed_unique = []
-        for doc in docs_processed:
-            if doc.page_content not in unique_texts:
-                unique_texts[doc.page_content] = True
-                docs_processed_unique.append(doc)
-
-        return docs_processed_unique
-
-    def __init__(self, path=""):
 
+    def __init__(self, path=""):
         # Config LangChain
         os.environ["LANGCHAIN_TRACING_V2"] = "true"
         os.environ["LANGCHAIN_API_KEY"] = "ls__9834e6b2ff094d43a28418c9ecea2fd5"
-
-        EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
 
         # Load Vector db
         urls = [
@@ -77,34 +61,27 @@ class EndpointHandler():
 
         loader = WebBaseLoader(urls)
         docs = loader.load()
-
-        MARKDOWN_SEPARATORS = [
-            "\n#{1,6} ",
-            "```\n",
-            "\n\\*\\*\\*+\n",
-            "\n---+\n",
-            "\n___+\n",
-            "\n\n",
-            "\n",
-            " ",
-            "",
-        ]
 
-        text_splitter = RecursiveCharacterTextSplitter(
-
-
-
-
+        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+            AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
+            chunk_size=512,
+            chunk_overlap=int(512 / 10),
+            add_start_index=True,
+            strip_whitespace=True,
             separators=MARKDOWN_SEPARATORS,
         )
-
-        docs_processed = text_splitter.split_documents(docs)
 
-        docs_processed =
-
-
-
-
+        docs_processed = []
+        for doc in docs:
+            docs_processed += text_splitter.split_documents([doc])
+
+        # Remove duplicates
+        unique_texts = {}
+        docs_processed_unique = []
+        for doc in docs_processed:
+            if doc.page_content not in unique_texts:
+                unique_texts[doc.page_content] = True
+                docs_processed_unique.append(doc)
 
         embedding_model = HuggingFaceEmbeddings(
             model_name=EMBEDDING_MODEL_NAME,
@@ -114,7 +91,7 @@ class EndpointHandler():
         )
 
         self.vectorstore = FAISS.from_documents(
-
+            docs_processed_unique, embedding_model, distance_strategy=DistanceStrategy.COSINE
         )
 
         # Create LLM
@@ -132,7 +109,7 @@ class EndpointHandler():
         # Testing
         # tokenizer.pad_token = tokenizer.eos_token
 
-        READER_LLM = pipeline(
+        self.READER_LLM = pipeline(
            model=model,
            tokenizer=tokenizer,
            task="text-generation",
@@ -180,7 +157,7 @@ class EndpointHandler():
         )
 
         # Redact an answer
-        answer = READER_LLM(final_prompt)[0]["generated_text"]
+        answer = self.READER_LLM(final_prompt)[0]["generated_text"]
 
         return answer
 
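Notes on the main hunks follow. The refactor drops the split_documents helper and inlines the splitting into __init__ with a hard-coded chunk size of 512 tokens, counted with the embedding model's own tokenizer. A runnable sketch of the new splitting-plus-dedup flow, with two invented in-memory Documents standing in for the WebBaseLoader output:

# Sketch of the new splitting + dedup flow; the sample Documents are
# invented here, everything else mirrors the commit.
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n",
    "\n___+\n", "\n\n", "\n", " ", "",
]

# chunk_size is measured in tokens of the embedding model's tokenizer;
# the overlap is fixed at 10% of the chunk size, exactly as in the diff.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
    chunk_size=512,
    chunk_overlap=int(512 / 10),
    add_start_index=True,
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)

docs = [
    Document(page_content="# Title\n\nSome page text."),
    Document(page_content="# Title\n\nSome page text."),  # exact duplicate
]

docs_processed = []
for doc in docs:
    docs_processed += text_splitter.split_documents([doc])

# Deduplicate on raw page_content; the second, identical chunk drops out.
unique_texts = {}
docs_processed_unique = []
for doc in docs_processed:
    if doc.page_content not in unique_texts:
        unique_texts[doc.page_content] = True
        docs_processed_unique.append(doc)

print(len(docs_processed), "->", len(docs_processed_unique))  # 2 -> 1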
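The FAISS index is now built from the deduplicated chunks with an explicit cosine distance strategy instead of the old, partially elided arguments. A standalone sketch of that construction plus a query, with a single toy document standing in for docs_processed_unique:

# Sketch of the new FAISS construction; the toy document stands in for
# docs_processed_unique from the handler.
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

embedding_model = HuggingFaceEmbeddings(
    model_name="mixedbread-ai/mxbai-embed-large-v1",
)

vectorstore = FAISS.from_documents(
    [Document(page_content="FAISS indexes dense embedding vectors.")],
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Retrieval against the index; k=1 returns the single stored chunk.
hits = vectorstore.similarity_search("what does FAISS index?", k=1)
print(hits[0].page_content)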
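The transformers import line also gains BitsAndBytesConfig, which points at a quantized load of the reader model, but the hunk that would configure it is not part of this diff. A typical 4-bit setup is sketched below; every value, and the model id, is an assumption rather than the handler's actual configuration:

# The diff only adds the BitsAndBytesConfig import; the code that uses it
# is not shown. All values below are assumed for illustration.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # assumed: 4-bit weights
    bnb_4bit_use_double_quant=True,         # assumed
    bnb_4bit_quant_type="nf4",              # assumed
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumed
)

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",         # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",
)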
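Finally, renaming READER_LLM to self.READER_LLM is a real bug fix, not cosmetics: the old name was a local variable of __init__, so the reference at old line 183 (apparently inside __call__) would raise a NameError at request time. A minimal sketch of the fixed shape, with a placeholder model id and simplified prompt handling:

# Shape of the READER_LLM fix: the pipeline must live on self so that
# __call__ can reach it; the old local vanished when __init__ returned.
from typing import Any, Dict
from transformers import pipeline

class HandlerSketch:
    def __init__(self):
        self.READER_LLM = pipeline(
            task="text-generation",
            model="gpt2",  # placeholder; the real handler loads its own model
        )

    def __call__(self, data: Dict[str, Any]) -> str:
        final_prompt = data.get("inputs", "")
        # Same call shape as the fixed line: a list of generations,
        # first one's text.
        return self.READER_LLM(final_prompt)[0]["generated_text"]

handler = HandlerSketch()
print(handler({"inputs": "Hello"}))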