Tonic committed
Commit: 70c5bc9
1 Parent(s): a4669e2

chroma langchain fix 1

Files changed (2):
  1. app.py +35 -16
  2. requirements.txt +1 -0
app.py CHANGED
@@ -113,9 +113,13 @@ from langchain_community.document_loaders import UnstructuredFileLoader
 from chromadb import Documents, EmbeddingFunction, Embeddings
 from chromadb.config import Settings
 from chromadb import HttpClient
+from langchain_chroma import Chroma
 from utils import load_env_variables, parse_and_route
 from globalvars import API_BASE, intention_prompt, tasks, system_message, model_name
+from langchain_core.embeddings import Embeddings
+from chromadb.api.types import EmbeddingFunction, Documents
 
+
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 os.environ['CUDA_CACHE_DISABLE'] = '1'
@@ -178,6 +182,23 @@ class EmbeddingGenerator:
         self.clear_cuda_cache()
         return embeddings_list
 
+class ChromaEmbeddingsAdapter(Embeddings):
+    def __init__(self, ef: EmbeddingFunction):
+        self.ef = ef
+
+    def embed_documents(self, texts):
+        return self.ef(texts)
+
+    def embed_query(self, query):
+        return self.ef([query])[0]
+
+class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):
+    def __init__(self, ef: Embeddings):
+        self.ef = ef
+
+    def __call__(self, input: Documents) -> Embeddings:
+        return self.ef.embed_documents(input)
+
 class MyEmbeddingFunction(EmbeddingFunction):
     def __init__(self, embedding_generator: EmbeddingGenerator):
         self.embedding_generator = embedding_generator
@@ -193,25 +214,22 @@ def load_documents(file_path: str, mode: str = "elements"):
     return [doc.page_content for doc in docs]
 
 def initialize_chroma(collection_name: str, embedding_function: MyEmbeddingFunction):
-    client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
-    client.reset()  # resets the database
-    collection = client.create_collection(collection_name)
-    return client, collection
+    client = Chroma.from_documents([], ChromaEmbeddingsAdapter(embedding_function))  # Initialize with no documents
+    return client
 
-def add_documents_to_chroma(client, collection, documents: list, embedding_function: MyEmbeddingFunction):
+def add_documents_to_chroma(client, documents: list, embedding_function: MyEmbeddingFunction):
     for doc in documents:
-        collection.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
+        client.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
 
-def query_chroma(client, collection_name: str, query_text: str, embedding_function: MyEmbeddingFunction):
-    db = Chroma(client=client, collection_name=collection_name, embedding_function=embedding_function)
-    result_docs = db.similarity_search(query_text)
+def query_chroma(client, query_text: str):
+    result_docs = client.similarity_search(query_text)
     return result_docs
-
+
 # Initialize clients
 intention_client = OpenAI(api_key=yi_token, base_url=API_BASE)
 embedding_generator = EmbeddingGenerator(model_name=model_name, token=hf_token, intention_client=intention_client)
 embedding_function = MyEmbeddingFunction(embedding_generator=embedding_generator)
-chroma_client, chroma_collection = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
+chroma_client = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
 
 def respond(
     message,
@@ -243,14 +261,15 @@ def respond(
 
 def upload_documents(files):
     for file in files:
-        loader = DocumentLoader(file.name)
-        documents = loader.load_documents()
-        chroma_manager.add_documents(documents)
+        loader = UnstructuredFileLoader(file.name)
+        documents = loader.load()
+        add_documents_to_chroma(chroma_client, documents, embedding_function)
     return "Documents uploaded and processed successfully!"
 
+
 def query_documents(query):
-    results = chroma_manager.query(query)
-    return "\n\n".join([result.content for result in results])
+    results = query_chroma(chroma_client, query)
+    return "\n\n".join([result.page_content for result in results])
 
 with gr.Blocks() as demo:
     with gr.Tab("Upload Documents"):
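
The adapter classes added in this commit bridge Chroma's EmbeddingFunction protocol and LangChain's Embeddings interface. The following is a minimal, self-contained sketch (not part of the commit) of how ChromaEmbeddingsAdapter can drive the langchain_chroma Chroma store; DummyEmbeddingFunction is a hypothetical stand-in for MyEmbeddingFunction, and the example texts are illustrative only.

# Usage sketch: wire a Chroma-style embedding function into LangChain's
# Chroma vector store via the adapter introduced above.
from chromadb.api.types import Documents, EmbeddingFunction
from langchain_core.embeddings import Embeddings
from langchain_chroma import Chroma

class DummyEmbeddingFunction(EmbeddingFunction):
    # Stand-in for MyEmbeddingFunction: maps each text to a toy 3-d vector.
    def __call__(self, input: Documents):
        return [[float(len(text)), 0.0, 1.0] for text in input]

class ChromaEmbeddingsAdapter(Embeddings):
    # Same adapter as in app.py, repeated here so the sketch is self-contained.
    def __init__(self, ef: EmbeddingFunction):
        self.ef = ef

    def embed_documents(self, texts):
        return self.ef(texts)

    def embed_query(self, query):
        return self.ef([query])[0]

# LangChain's Chroma store accepts any Embeddings implementation, so the
# adapter lets the Chroma-style function produce the vectors.
store = Chroma(
    collection_name="Tonic-instruct",
    embedding_function=ChromaEmbeddingsAdapter(DummyEmbeddingFunction()),
)
store.add_texts(["hello world", "goodbye world"], ids=["doc-1", "doc-2"])
print(store.similarity_search("hello", k=1))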
requirements.txt CHANGED
@@ -7,6 +7,7 @@ openai
 python-dotenv
 chromadb
 langchain-community
+langchain-chroma
 unstructured[all-docs]
 libmagic
 # poppler
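
The only dependency change is the new langchain-chroma package. As a quick sanity check (assuming the requirements above are installed), the import paths app.py now relies on should all resolve; the snippet below is illustrative and not part of the commit.

# Smoke test for the new dependency.
from langchain_chroma import Chroma                           # provided by langchain-chroma
from langchain_core.embeddings import Embeddings              # transitive dependency of langchain-chroma
from chromadb.api.types import EmbeddingFunction, Documents   # provided by chromadb

print(Chroma, Embeddings, EmbeddingFunction, Documents)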