Spaces (Running)

fahmiaziz98 committed • Commit 26de2cd
1 Parent(s): 84c30b3

init

Browse files:
- apps/agent/constant.py +28 -1
- apps/agent/multi_query_chain.py +0 -37
- apps/agent/tools.py +29 -14
- apps/agent/utils.py +41 -0
apps/agent/constant.py CHANGED
@@ -23,4 +23,31 @@ PROMPT = ChatPromptTemplate.from_messages(
         ),
         ("placeholder", "{messages}")
     ]
-)
+)
+
+# list of documentation URLs to index
+URLS_XANO = [
+    "https://docs.xano.com/about",
+    "https://releases.xano.com/?_gl=1*sifgtw*_ga*MTI5NTY3MTk5NS4xNzMwNjMzNjY3*_ga_EJWDZRK3CG*MTczMDgwNjg3Mi43LjEuMTczMDgwNjkyMy45LjAuODUyNzA5OTA4",
+    "https://docs.xano.com/onboarding-tutorial-reference",
+    "https://docs.xano.com/faq",
+    "https://docs.xano.com/about",
+    "https://docs.xano.com/what-xano-includes",
+    "https://docs.xano.com/what-xano-includes/instance",
+    "https://docs.xano.com/what-xano-includes/workspace",
+    "https://docs.xano.com/database/triggers",
+    "https://docs.xano.com/fundamentals/the-development-life-cycle",
+
+]
+
+URLS_WEWEB = [
+    "https://docs.weweb.io/start-here/welcome.html",
+    "https://docs.weweb.io/start-here/frequently-asked-questions.html",
+    "https://docs.weweb.io/editor/intro-to-the-editor.html",
+    "https://docs.weweb.io/editor/intro-to-html-css.html",
+    "https://docs.weweb.io/editor/how-to-use-the-add-panel.html",
+    "https://docs.weweb.io/editor/logs.html",
+    "https://docs.weweb.io/editor/copilot/import-figma-designs.html",
+    "https://docs.weweb.io/editor/app-settings/app-settings.html",
+    "https://docs.weweb.io/editor/app-settings/pwa.html"
+]
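Note: https://docs.xano.com/about appears twice in URLS_XANO, so that page gets fetched, chunked, and indexed twice. A minimal, order-preserving dedupe would avoid the duplicate work; this is a hypothetical guard, not part of the commit:

# Hypothetical (not in the commit): drop duplicate URLs while preserving order.
URLS_XANO = list(dict.fromkeys(URLS_XANO))
URLS_WEWEB = list(dict.fromkeys(URLS_WEWEB))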
apps/agent/multi_query_chain.py DELETED
@@ -1,37 +0,0 @@
-from typing import List
-
-from langchain_core.output_parsers import BaseOutputParser
-from langchain_core.prompts import PromptTemplate
-from pydantic import BaseModel, Field
-from langchain_groq import ChatGroq
-from apps.agent.constant import GROQ_API_KEY, MODEL_GROQ
-
-# Output parser will split the LLM result into a list of queries
-class LineListOutputParser(BaseOutputParser[List[str]]):
-    """Output parser for a list of lines."""
-
-    def parse(self, text: str) -> List[str]:
-        lines = text.strip().split("\n")
-        return list(filter(None, lines))  # Remove empty lines
-
-output_parser = LineListOutputParser()
-llm = ChatGroq(model=MODEL_GROQ, groq_api_key=GROQ_API_KEY, temperature=0.1)
-
-template = """
-Your task is to generate 3 different search queries that aim to
-answer the user question from multiple perspectives. The user questions
-are focused on Large Language Models, Machine Learning, and related
-disciplines.
-Each query MUST tackle the question from a different viewpoint, we
-want to get a variety of RELEVANT search results.
-Provide these alternative questions separated by newlines.
-GENERATE ONLY QUERY! dont add explanation and word
-Original question: {question}
-"""
-
-QUERY_PROMPT = PromptTemplate(
-    input_variables=["question"],
-    template=template,
-)
-# Chain
-llm_chain = QUERY_PROMPT | llm | output_parser
apps/agent/tools.py CHANGED
@@ -1,39 +1,54 @@
 import os
-from langchain_community.vectorstores import Pinecone
+from langchain_community.vectorstores import Pinecone
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain.retrievers.document_compressors import FlashrankRerank
+from langchain.retrievers import EnsembleRetriever, BM25Retriever
 from langchain_core.tools import tool
-from langchain.retrievers.multi_query import MultiQueryRetriever
-
-from apps.agent.multi_query_chain import llm_chain
-from apps.agent.constant import INDEX_NAME_WEWEB, INDEX_NAME_XANO
+from typing import Any
+
+from apps.agent.utils import load_and_split_docs
+from apps.agent.constant import (
+    INDEX_NAME_WEWEB,
+    INDEX_NAME_XANO,
+    URLS_WEWEB,
+    URLS_XANO,
+)
 
 embeddings = FastEmbedEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
 compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
 
-def create_retriever(index_name: str):
+def ensemble_retriever(index_name: str, docs: Any, embeddings, compressor):
+    # retriever
     vectorstore = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)
     retriever = vectorstore.as_retriever()
-    multi_query = MultiQueryRetriever(
-        retriever=retriever, llm_chain=llm_chain, parser_key="lines"
-    )
+
+    # bm25
+    bm25 = BM25Retriever.from_documents(docs)
+    bm25.k = 6
+    ensemble_retriever = EnsembleRetriever(retrievers=[retriever, bm25],
+                                           weights=[0.6, 0.4])
+
+    # reranker
     reranker_retriever = ContextualCompressionRetriever(
-        base_compressor=compressor, base_retriever=multi_query
+        base_compressor=compressor, base_retriever=ensemble_retriever
     )
     return reranker_retriever
 
-retriever_xano = create_retriever(INDEX_NAME_XANO)
-retriever_weweb = create_retriever(INDEX_NAME_WEWEB)
+# load data
+data_xano = load_and_split_docs(URLS_XANO)
+data_weweb = load_and_split_docs(URLS_WEWEB)
+
+# create retrievers
+retriever_xano = ensemble_retriever(INDEX_NAME_XANO, data_xano, embeddings, compressor)
+retriever_weweb = ensemble_retriever(INDEX_NAME_WEWEB, data_weweb, embeddings, compressor)
 
 @tool
 def tool_xano(query: str):
     """
     Searches and returns excerpts from the Xano documentation
     """
-    return retriever_xano.get_relevant_documents(query)
+    return retriever_xano.invoke(query)
 
 
 @tool
@@ -41,4 +56,4 @@ def tool_weweb(query: str):
     """
     Searches and returns excerpts from the Weweb documentation
     """
-    return retriever_weweb.get_relevant_documents(query)
+    return retriever_weweb.invoke(query)
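Because tools.py now builds its retrievers at import time (every URL is loaded through Selenium and split, which is slow), a quick smoke test of the finished tools is worth having. This is a hypothetical usage sketch, not part of the commit; LangChain @tool objects expose .invoke():

# Hypothetical smoke test (not in the commit).
from apps.agent.tools import tool_xano

# The tool returns whatever the underlying retriever returns: a reranked
# list of Documents from the BM25 + Pinecone ensemble.
docs = tool_xano.invoke("How do database triggers work in Xano?")
for doc in docs:
    print(doc.metadata.get("id"), doc.page_content[:80])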
apps/agent/utils.py ADDED
@@ -0,0 +1,41 @@
+import uuid
+import logging
+from typing import List
+from langchain_community.document_loaders import SeleniumURLLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# add logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# load the documentation pages and split them into chunks
+def load_and_split_docs(urls: List[str]):
+
+    MARKDOWN_SEPARATORS = [
+        "\n#{1,6} ",
+        "```\n",
+        "\n\\*\\*\\*+\n",
+        "\n---+\n",
+        "\n___+\n",
+        "\n\n",
+        "\n",
+        " ",
+        "",
+    ]
+    logger.info("Loading pages with SeleniumURLLoader...")
+    loader = SeleniumURLLoader(urls=urls)
+    docs = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=512,  # maximum number of tokens per chunk (value chosen arbitrarily)
+        chunk_overlap=50,  # number of tokens shared between consecutive chunks
+        add_start_index=True,  # if True, includes each chunk's start index in its metadata
+        strip_whitespace=True,  # if True, strips whitespace from the start and end of every chunk
+        separators=MARKDOWN_SEPARATORS,
+    )
+
+    logger.info("Splitting documents...")
+    docs_split = text_splitter.split_documents(docs)
+    for doc in docs_split:
+        doc.metadata['id'] = str(uuid.uuid4())[:4]
+        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
+    return docs_split
|