Spaces:

Tonic
/

YiJina

Build error

App Files Community

Tonic committed on Jul 11

Commit

be9cd13

•

1 Parent(s): 70c5bc9

Revert "chroma langchain fix 1"

Browse files

This reverts commit 70c5bc93401cd8a8a030ca08155fcfeb5906751a.

Files changed (2) hide show

app.py +16 -35
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -113,13 +113,9 @@ from langchain_community.document_loaders import UnstructuredFileLoader
 from chromadb import Documents, EmbeddingFunction, Embeddings
 from chromadb.config import Settings
 from chromadb import HttpClient
-from langchain_chroma import Chroma
 from utils import load_env_variables, parse_and_route
 from globalvars import API_BASE, intention_prompt, tasks, system_message, model_name
-from langchain_core.embeddings import Embeddings
-from chromadb.api.types import EmbeddingFunction, Documents
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 os.environ['CUDA_CACHE_DISABLE'] = '1'
@@ -182,23 +178,6 @@ class EmbeddingGenerator:
             self.clear_cuda_cache()
             return embeddings_list
-class ChromaEmbeddingsAdapter(Embeddings):
-    def __init__(self, ef: EmbeddingFunction):
-        self.ef = ef
-    def embed_documents(self, texts):
-        return self.ef(texts)
-    def embed_query(self, query):
-        return self.ef([query])[0]
-class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):
-    def __init__(self, ef: Embeddings):
-        self.ef = ef
-    def __call__(self, input: Documents) -> Embeddings:
-        return self.ef.embed_documents(input)
 class MyEmbeddingFunction(EmbeddingFunction):
     def __init__(self, embedding_generator: EmbeddingGenerator):
         self.embedding_generator = embedding_generator
@@ -214,22 +193,25 @@ def load_documents(file_path: str, mode: str = "elements"):
     return [doc.page_content for doc in docs]
 def initialize_chroma(collection_name: str, embedding_function: MyEmbeddingFunction):
-    client = Chroma.from_documents([], ChromaEmbeddingsAdapter(embedding_function))  # Initialize with no documents
-    return client
-def add_documents_to_chroma(client, documents: list, embedding_function: MyEmbeddingFunction):
     for doc in documents:
-        client.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
-def query_chroma(client, query_text: str):
-    result_docs = client.similarity_search(query_text)
     return result_docs
 # Initialize clients
 intention_client = OpenAI(api_key=yi_token, base_url=API_BASE)
 embedding_generator = EmbeddingGenerator(model_name=model_name, token=hf_token, intention_client=intention_client)
 embedding_function = MyEmbeddingFunction(embedding_generator=embedding_generator)
-chroma_client = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
 def respond(
     message,
@@ -261,15 +243,14 @@ def respond(
 def upload_documents(files):
     for file in files:
-        loader = UnstructuredFileLoader(file.name)
-        documents = loader.load()
-        add_documents_to_chroma(chroma_client, documents, embedding_function)
     return "Documents uploaded and processed successfully!"
 def query_documents(query):
-    results = query_chroma(chroma_client, query)
-    return "\n\n".join([result.page_content for result in results])
 with gr.Blocks() as demo:
     with gr.Tab("Upload Documents"):

 from chromadb import Documents, EmbeddingFunction, Embeddings
 from chromadb.config import Settings
 from chromadb import HttpClient
 from utils import load_env_variables, parse_and_route
 from globalvars import API_BASE, intention_prompt, tasks, system_message, model_name
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 os.environ['CUDA_CACHE_DISABLE'] = '1'
             self.clear_cuda_cache()
             return embeddings_list
 class MyEmbeddingFunction(EmbeddingFunction):
     def __init__(self, embedding_generator: EmbeddingGenerator):
         self.embedding_generator = embedding_generator
     return [doc.page_content for doc in docs]
 def initialize_chroma(collection_name: str, embedding_function: MyEmbeddingFunction):
+    client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
+    client.reset()  # resets the database
+    collection = client.create_collection(collection_name)
+    return client, collection
+def add_documents_to_chroma(client, collection, documents: list, embedding_function: MyEmbeddingFunction):
     for doc in documents:
+        collection.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
+def query_chroma(client, collection_name: str, query_text: str, embedding_function: MyEmbeddingFunction):
+    db = Chroma(client=client, collection_name=collection_name, embedding_function=embedding_function)
+    result_docs = db.similarity_search(query_text)
     return result_docs
 # Initialize clients
 intention_client = OpenAI(api_key=yi_token, base_url=API_BASE)
 embedding_generator = EmbeddingGenerator(model_name=model_name, token=hf_token, intention_client=intention_client)
 embedding_function = MyEmbeddingFunction(embedding_generator=embedding_generator)
+chroma_client, chroma_collection = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
 def respond(
     message,
 def upload_documents(files):
     for file in files:
+        loader = DocumentLoader(file.name)
+        documents = loader.load_documents()
+        chroma_manager.add_documents(documents)
     return "Documents uploaded and processed successfully!"
 def query_documents(query):
+    results = chroma_manager.query(query)
+    return "\n\n".join([result.content for result in results])
 with gr.Blocks() as demo:
     with gr.Tab("Upload Documents"):

requirements.txt CHANGED Viewed

@@ -7,7 +7,6 @@ openai
 python-dotenv
 chromadb
 langchain-community
-langchain-chroma
 unstructured[all-docs]
 libmagic
 # poppler

 python-dotenv
 chromadb
 langchain-community
 unstructured[all-docs]
 libmagic
 # poppler