Spaces (Running)

fahmiaziz98 committed • Commit 26de2cd
1 Parent(s): 84c30b3

init

Browse files:
- apps/agent/constant.py +28 -1
- apps/agent/multi_query_chain.py +0 -37
- apps/agent/tools.py +29 -14
- apps/agent/utils.py +41 -0
apps/agent/constant.py CHANGED
@@ -23,4 +23,31 @@ PROMPT = ChatPromptTemplate.from_messages(
         ),
         ("placeholder", "{messages}")
     ]
-)
+)
+
+# list of documentation URLs to index
+URLS_XANO = [
+    "https://docs.xano.com/about",
+    "https://releases.xano.com/?_gl=1*sifgtw*_ga*MTI5NTY3MTk5NS4xNzMwNjMzNjY3*_ga_EJWDZRK3CG*MTczMDgwNjg3Mi43LjEuMTczMDgwNjkyMy45LjAuODUyNzA5OTA4",
+    "https://docs.xano.com/onboarding-tutorial-reference",
+    "https://docs.xano.com/faq",
+    "https://docs.xano.com/about",
+    "https://docs.xano.com/what-xano-includes",
+    "https://docs.xano.com/what-xano-includes/instance",
+    "https://docs.xano.com/what-xano-includes/workspace",
+    "https://docs.xano.com/database/triggers",
+    "https://docs.xano.com/fundamentals/the-development-life-cycle",
+
+]
+
+URLS_WEWEB = [
+    "https://docs.weweb.io/start-here/welcome.html",
+    "https://docs.weweb.io/start-here/frequently-asked-questions.html",
+    "https://docs.weweb.io/editor/intro-to-the-editor.html",
+    "https://docs.weweb.io/editor/intro-to-html-css.html",
+    "https://docs.weweb.io/editor/how-to-use-the-add-panel.html",
+    "https://docs.weweb.io/editor/logs.html",
+    "https://docs.weweb.io/editor/copilot/import-figma-designs.html",
+    "https://docs.weweb.io/editor/app-settings/app-settings.html",
+    "https://docs.weweb.io/editor/app-settings/pwa.html"
+]
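Note: https://docs.xano.com/about appears twice in URLS_XANO, so that page gets fetched, chunked, and indexed twice. A minimal, order-preserving dedupe would avoid the duplicate work; this is a hypothetical guard, not part of the commit:

# Hypothetical (not in the commit): drop duplicate URLs while preserving order.
URLS_XANO = list(dict.fromkeys(URLS_XANO))
URLS_WEWEB = list(dict.fromkeys(URLS_WEWEB))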
apps/agent/multi_query_chain.py DELETED
@@ -1,37 +0,0 @@
-from typing import List
-
-from langchain_core.output_parsers import BaseOutputParser
-from langchain_core.prompts import PromptTemplate
-from pydantic import BaseModel, Field
-from langchain_groq import ChatGroq
-from apps.agent.constant import GROQ_API_KEY, MODEL_GROQ
-
-# Output parser will split the LLM result into a list of queries
-class LineListOutputParser(BaseOutputParser[List[str]]):
-    """Output parser for a list of lines."""
-
-    def parse(self, text: str) -> List[str]:
-        lines = text.strip().split("\n")
-        return list(filter(None, lines))  # Remove empty lines
-
-output_parser = LineListOutputParser()
-llm = ChatGroq(model=MODEL_GROQ, groq_api_key=GROQ_API_KEY, temperature=0.1)
-
-template = """
-Your task is to generate 3 different search queries that aim to
-answer the user question from multiple perspectives. The user questions
-are focused on Large Language Models, Machine Learning, and related
-disciplines.
-Each query MUST tackle the question from a different viewpoint, we
-want to get a variety of RELEVANT search results.
-Provide these alternative questions separated by newlines.
-GENERATE ONLY QUERY! dont add explanation and word
-Original question: {question}
-"""
-
-QUERY_PROMPT = PromptTemplate(
-    input_variables=["question"],
-    template=template,
-)
-# Chain
-llm_chain = QUERY_PROMPT | llm | output_parser
apps/agent/tools.py CHANGED
@@ -1,39 +1,54 @@
 import os
-from langchain_community.vectorstores import Pinecone
+from langchain_community.vectorstores import Pinecone
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain.retrievers.document_compressors import FlashrankRerank
+from langchain.retrievers import EnsembleRetriever, BM25Retriever
 from langchain_core.tools import tool
-from langchain.retrievers.multi_query import MultiQueryRetriever
-
-from apps.agent.multi_query_chain import llm_chain
-from apps.agent.constant import INDEX_NAME_WEWEB, INDEX_NAME_XANO
+from typing import Any
+
+from apps.agent.utils import load_and_split_docs
+from apps.agent.constant import (
+    INDEX_NAME_WEWEB,
+    INDEX_NAME_XANO,
+    URLS_WEWEB,
+    URLS_XANO,
+)
 
 embeddings = FastEmbedEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
 compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
 
-def create_retriever(index_name: str):
+def ensemble_retriever(index_name: str, docs: Any, embeddings, compressor):
+    # retriever
     vectorstore = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)
     retriever = vectorstore.as_retriever()
-    multi_query = MultiQueryRetriever(
-        retriever=retriever, llm_chain=llm_chain, parser_key="lines"
-    )
+
+    # bm25
+    bm25 = BM25Retriever.from_documents(docs)
+    bm25.k = 6
+    ensemble_retriever = EnsembleRetriever(retrievers=[retriever, bm25],
+                                           weights=[0.6, 0.4])
+
+    # reranker
     reranker_retriever = ContextualCompressionRetriever(
-        base_compressor=compressor, base_retriever=multi_query
+        base_compressor=compressor, base_retriever=ensemble_retriever
     )
     return reranker_retriever
 
-retriever_xano = create_retriever(INDEX_NAME_XANO)
-retriever_weweb = create_retriever(INDEX_NAME_WEWEB)
+# load data
+data_xano = load_and_split_docs(URLS_XANO)
+data_weweb = load_and_split_docs(URLS_WEWEB)
+
+# create retrievers
+retriever_xano = ensemble_retriever(INDEX_NAME_XANO, data_xano, embeddings, compressor)
+retriever_weweb = ensemble_retriever(INDEX_NAME_WEWEB, data_weweb, embeddings, compressor)
 
 @tool
 def tool_xano(query: str):
     """
     Searches and returns excerpts from the Xano documentation
     """
-    return retriever_xano.get_relevant_documents(query)
+    return retriever_xano.invoke(query)
 
 
 @tool
@@ -41,4 +56,4 @@ def tool_weweb(query: str):
     """
     Searches and returns excerpts from the Weweb documentation
     """
-    return retriever_weweb.get_relevant_documents(query)
+    return retriever_weweb.invoke(query)
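Because tools.py now builds its retrievers at import time (every URL is loaded through Selenium and split, which is slow), a quick smoke test of the finished tools is worth having. This is a hypothetical usage sketch, not part of the commit; LangChain @tool objects expose .invoke():

# Hypothetical smoke test (not in the commit).
from apps.agent.tools import tool_xano

# The tool returns whatever the underlying retriever returns: a reranked
# list of Documents from the BM25 + Pinecone ensemble.
docs = tool_xano.invoke("How do database triggers work in Xano?")
for doc in docs:
    print(doc.metadata.get("id"), doc.page_content[:80])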
apps/agent/utils.py ADDED
@@ -0,0 +1,41 @@
+import uuid
+import logging
+from typing import List
+from langchain_community.document_loaders import SeleniumURLLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# add logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# load the documentation pages and split them into chunks
+def load_and_split_docs(urls: List[str]):
+
+    MARKDOWN_SEPARATORS = [
+        "\n#{1,6} ",
+        "```\n",
+        "\n\\*\\*\\*+\n",
+        "\n---+\n",
+        "\n___+\n",
+        "\n\n",
+        "\n",
+        " ",
+        "",
+    ]
+    logger.info("Loading pages with SeleniumURLLoader...")
+    loader = SeleniumURLLoader(urls=urls)
+    docs = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=512,  # maximum number of tokens per chunk (value chosen arbitrarily)
+        chunk_overlap=50,  # number of tokens shared between consecutive chunks
+        add_start_index=True,  # if True, includes each chunk's start index in its metadata
+        strip_whitespace=True,  # if True, strips whitespace from the start and end of every chunk
+        separators=MARKDOWN_SEPARATORS,
+    )
+
+    logger.info("Splitting documents...")
+    docs_split = text_splitter.split_documents(docs)
+    for doc in docs_split:
+        doc.metadata['id'] = str(uuid.uuid4())[:4]
+        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
+    return docs_split
|