Tonic committed
Commit: 70c5bc9
1 Parent(s): a4669e2

chroma langchain fix 1

Files changed (2):
  1. app.py +35 -16
  2. requirements.txt +1 -0
app.py CHANGED
@@ -113,9 +113,13 @@ from langchain_community.document_loaders import UnstructuredFileLoader
 from chromadb import Documents, EmbeddingFunction, Embeddings
 from chromadb.config import Settings
 from chromadb import HttpClient
+from langchain_chroma import Chroma
 from utils import load_env_variables, parse_and_route
 from globalvars import API_BASE, intention_prompt, tasks, system_message, model_name
+from langchain_core.embeddings import Embeddings
+from chromadb.api.types import EmbeddingFunction, Documents
 
+
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 os.environ['CUDA_CACHE_DISABLE'] = '1'
@@ -178,6 +182,23 @@ class EmbeddingGenerator:
         self.clear_cuda_cache()
         return embeddings_list
 
+class ChromaEmbeddingsAdapter(Embeddings):
+    def __init__(self, ef: EmbeddingFunction):
+        self.ef = ef
+
+    def embed_documents(self, texts):
+        return self.ef(texts)
+
+    def embed_query(self, query):
+        return self.ef([query])[0]
+
+class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):
+    def __init__(self, ef: Embeddings):
+        self.ef = ef
+
+    def __call__(self, input: Documents) -> Embeddings:
+        return self.ef.embed_documents(input)
+
 class MyEmbeddingFunction(EmbeddingFunction):
     def __init__(self, embedding_generator: EmbeddingGenerator):
         self.embedding_generator = embedding_generator
@@ -193,25 +214,22 @@ def load_documents(file_path: str, mode: str = "elements"):
     return [doc.page_content for doc in docs]
 
 def initialize_chroma(collection_name: str, embedding_function: MyEmbeddingFunction):
-    client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
-    client.reset()  # resets the database
-    collection = client.create_collection(collection_name)
-    return client, collection
+    client = Chroma.from_documents([], ChromaEmbeddingsAdapter(embedding_function))  # Initialize with no documents
+    return client
 
-def add_documents_to_chroma(client, collection, documents: list, embedding_function: MyEmbeddingFunction):
+def add_documents_to_chroma(client, documents: list, embedding_function: MyEmbeddingFunction):
     for doc in documents:
-        collection.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
+        client.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
 
-def query_chroma(client, collection_name: str, query_text: str, embedding_function: MyEmbeddingFunction):
-    db = Chroma(client=client, collection_name=collection_name, embedding_function=embedding_function)
-    result_docs = db.similarity_search(query_text)
+def query_chroma(client, query_text: str):
+    result_docs = client.similarity_search(query_text)
     return result_docs
-
+
 # Initialize clients
 intention_client = OpenAI(api_key=yi_token, base_url=API_BASE)
 embedding_generator = EmbeddingGenerator(model_name=model_name, token=hf_token, intention_client=intention_client)
 embedding_function = MyEmbeddingFunction(embedding_generator=embedding_generator)
-chroma_client, chroma_collection = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
+chroma_client = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
 
 def respond(
     message,
@@ -243,14 +261,15 @@ def respond(
 
 def upload_documents(files):
     for file in files:
-        loader = DocumentLoader(file.name)
-        documents = loader.load_documents()
-        chroma_manager.add_documents(documents)
+        loader = UnstructuredFileLoader(file.name)
+        documents = loader.load()
+        add_documents_to_chroma(chroma_client, documents, embedding_function)
     return "Documents uploaded and processed successfully!"
 
+
 def query_documents(query):
-    results = chroma_manager.query(query)
-    return "\n\n".join([result.content for result in results])
+    results = query_chroma(chroma_client, query)
+    return "\n\n".join([result.page_content for result in results])
 
 with gr.Blocks() as demo:
     with gr.Tab("Upload Documents"):
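
The adapter classes added in this commit bridge Chroma's EmbeddingFunction protocol and LangChain's Embeddings interface. The following is a minimal, self-contained sketch (not part of the commit) of how ChromaEmbeddingsAdapter can drive the langchain_chroma Chroma store; DummyEmbeddingFunction is a hypothetical stand-in for MyEmbeddingFunction, and the example texts are illustrative only.

# Usage sketch: wire a Chroma-style embedding function into LangChain's
# Chroma vector store via the adapter introduced above.
from chromadb.api.types import Documents, EmbeddingFunction
from langchain_core.embeddings import Embeddings
from langchain_chroma import Chroma

class DummyEmbeddingFunction(EmbeddingFunction):
    # Stand-in for MyEmbeddingFunction: maps each text to a toy 3-d vector.
    def __call__(self, input: Documents):
        return [[float(len(text)), 0.0, 1.0] for text in input]

class ChromaEmbeddingsAdapter(Embeddings):
    # Same adapter as in app.py, repeated here so the sketch is self-contained.
    def __init__(self, ef: EmbeddingFunction):
        self.ef = ef

    def embed_documents(self, texts):
        return self.ef(texts)

    def embed_query(self, query):
        return self.ef([query])[0]

# LangChain's Chroma store accepts any Embeddings implementation, so the
# adapter lets the Chroma-style function produce the vectors.
store = Chroma(
    collection_name="Tonic-instruct",
    embedding_function=ChromaEmbeddingsAdapter(DummyEmbeddingFunction()),
)
store.add_texts(["hello world", "goodbye world"], ids=["doc-1", "doc-2"])
print(store.similarity_search("hello", k=1))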
requirements.txt CHANGED
@@ -7,6 +7,7 @@ openai
 python-dotenv
 chromadb
 langchain-community
+langchain-chroma
 unstructured[all-docs]
 libmagic
 # poppler
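
The only dependency change is the new langchain-chroma package. As a quick sanity check (assuming the requirements above are installed), the import paths app.py now relies on should all resolve; the snippet below is illustrative and not part of the commit.

# Smoke test for the new dependency.
from langchain_chroma import Chroma                           # provided by langchain-chroma
from langchain_core.embeddings import Embeddings              # transitive dependency of langchain-chroma
from chromadb.api.types import EmbeddingFunction, Documents   # provided by chromadb

print(Chroma, Embeddings, EmbeddingFunction, Documents)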