lchakkei committed
Commit
c3c8d59
1 Parent(s): 541a2db

Update handler.py

Files changed (1):
  1. handler.py +40 -63
handler.py CHANGED
@@ -2,7 +2,7 @@ import torch
 import locale
 import os
 from typing import Dict, List, Any
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
 from langchain.llms import HuggingFacePipeline
 from langchain.retrievers.document_compressors import LLMChainExtractor
 from langchain.retrievers import ContextualCompressionRetriever
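
The newly imported BitsAndBytesConfig suggests the reader model is loaded quantized. As a rough sketch of how that class is typically passed to AutoModelForCausalLM.from_pretrained (the model id and all 4-bit settings below are assumptions, not taken from this diff):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize weights to 4 bits on load
    bnb_4bit_use_double_quant=True,         # nested quantization, saves a bit more memory
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual matmuls
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",   # placeholder model id, not from this diff
    quantization_config=bnb_config,
    device_map="auto",
)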
@@ -26,46 +26,30 @@ from langchain.schema import format_document
 from langchain.memory import ConversationBufferMemory
 from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
 from langchain_core.runnables import RunnableParallel
-
+from typing import Optional
+from langchain.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores.utils import DistanceStrategy
+
+EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
+MARKDOWN_SEPARATORS = [
+    "\n#{1,6} ",
+    "```\n",
+    "\n\\*\\*\\*+\n",
+    "\n---+\n",
+    "\n___+\n",
+    "\n\n",
+    "\n",
+    " ",
+    "",
+]
+
 class EndpointHandler():
-    def split_documents(
-        chunk_size: int,
-        knowledge_base: [],
-        tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
-    ):
-        """
-        Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
-        """
-        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-            AutoTokenizer.from_pretrained(tokenizer_name),
-            chunk_size=chunk_size,
-            chunk_overlap=int(chunk_size / 10),
-            add_start_index=True,
-            strip_whitespace=True,
-            separators=MARKDOWN_SEPARATORS,
-        )
-
-        docs_processed = []
-        for doc in knowledge_base:
-            docs_processed += text_splitter.split_documents([doc])
-
-        # Remove duplicates
-        unique_texts = {}
-        docs_processed_unique = []
-        for doc in docs_processed:
-            if doc.page_content not in unique_texts:
-                unique_texts[doc.page_content] = True
-                docs_processed_unique.append(doc)
-
-        return docs_processed_unique
-
-    def __init__(self, path=""):
 
+    def __init__(self, path=""):
         # Config LangChain
         os.environ["LANGCHAIN_TRACING_V2"] = "true"
         os.environ["LANGCHAIN_API_KEY"] = "ls__9834e6b2ff094d43a28418c9ecea2fd5"
-
-        EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
 
         # Load Vector db
         urls = [
@@ -77,34 +61,27 @@ class EndpointHandler():
 
         loader = WebBaseLoader(urls)
         docs = loader.load()
-
-        MARKDOWN_SEPARATORS = [
-            "\n#{1,6} ",
-            "```\n",
-            "\n\\*\\*\\*+\n",
-            "\n---+\n",
-            "\n___+\n",
-            "\n\n",
-            "\n",
-            " ",
-            "",
-        ]
 
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,  # the maximum number of characters in a chunk: we selected this value arbitrarily
-            chunk_overlap=100,  # the number of characters to overlap between chunks
-            add_start_index=True,  # If `True`, includes chunk's start index in metadata
-            strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
+        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+            AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
+            chunk_size=512,
+            chunk_overlap=int(512 / 10),
+            add_start_index=True,
+            strip_whitespace=True,
             separators=MARKDOWN_SEPARATORS,
         )
-
-        docs_processed = text_splitter.split_documents(docs)
 
-        docs_processed = split_documents(
-            512,  # We choose a chunk size adapted to our model
-            docs,
-            tokenizer_name=EMBEDDING_MODEL_NAME,
-        )
+        docs_processed = []
+        for doc in docs:
+            docs_processed += text_splitter.split_documents([doc])
+
+        # Remove duplicates
+        unique_texts = {}
+        docs_processed_unique = []
+        for doc in docs_processed:
+            if doc.page_content not in unique_texts:
+                unique_texts[doc.page_content] = True
+                docs_processed_unique.append(doc)
 
         embedding_model = HuggingFaceEmbeddings(
             model_name=EMBEDDING_MODEL_NAME,
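
The switch to from_huggingface_tokenizer makes chunk_size count tokens of the embedding model rather than characters, so chunks are guaranteed to fit the embedder's context. A minimal, self-contained sketch of that splitter in isolation (the input text is a placeholder):

from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,    # maximum chunk length, measured in tokens
    chunk_overlap=51,  # int(512 / 10): roughly 10% overlap between chunks
)

# "..." stands in for real page text; each resulting chunk stays within 512 tokens.
for chunk in splitter.split_text("..."):
    print(len(tokenizer.encode(chunk)), chunk[:60])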
@@ -114,7 +91,7 @@ class EndpointHandler():
         )
 
         self.vectorstore = FAISS.from_documents(
-            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
+            docs_processed_unique, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
 
         # Create LLM
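
Once built from the deduplicated chunks, the store can be queried like any FAISS vector store. A sketch, assuming `handler` is an already-initialized EndpointHandler and the query string is made up:

docs = handler.vectorstore.similarity_search("placeholder question", k=4)
for doc in docs:
    # start_index is present in metadata because the splitter sets add_start_index=True
    print(doc.metadata.get("start_index"), doc.page_content[:80])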
@@ -132,7 +109,7 @@ class EndpointHandler():
         # Testing
         # tokenizer.pad_token = tokenizer.eos_token
 
-        READER_LLM = pipeline(
+        self.READER_LLM = pipeline(
             model=model,
             tokenizer=tokenizer,
             task="text-generation",
@@ -180,7 +157,7 @@ class EndpointHandler():
         )
 
         # Redact an answer
-        answer = READER_LLM(final_prompt)[0]["generated_text"]
+        answer = self.READER_LLM(final_prompt)[0]["generated_text"]
 
         return answer