Pecximenes committed on
Commit
853a071
1 Parent(s): 9e44228

Adding v1 of rag-agent

Browse files
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the Streamlit chatbot interface.
FROM python:3.8-slim

ENV APP_HOME=/app
WORKDIR ${APP_HOME}

# Install dependencies before copying the application sources so this layer
# is rebuilt only when requirements.txt changes, not on every code edit.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# The contents of ./agente are copied directly into ${APP_HOME}.
COPY ./agente ./

EXPOSE 7860

CMD ["streamlit", "run", "interface/chatbot.py", "--server.port=7860"]
agente/.env-example ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
OPENAI_API_KEY="SUA-KEY"
OPENAI_MODEL_CHAT="gpt-3.5-turbo"
OPENAI_MODEL_EMBEDDING="text-embedding-3-small"

COLLECTION_NAME="sgd"

EDGEDB_HOST="localhost"
EDGEDB_USER="sgd"
EDGEDB_PASSWORD="secret"
EDGEDB_PORT="5656"

# Read by agent.py and interface/main.py (host/port connection).
QDRANT_HOST="localhost"
QDRANT_PORT="6333"
# Read by interface/chatbot.py (URL + API key connection).
QDRANT_URL="http://localhost:6333"
QDRANT_KEY="SUA-KEY"

REDIS_HOST="localhost"
REDIS_PORT="6379"

HUGGINGFACE_API_KEY="SUA-KEY"
agente/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ Para testar o protótipo:
3
+
4
+ ```
5
+ streamlit run interface/main.py
6
+ ```
agente/WORKDIR/.gitkeep ADDED
File without changes
agente/__init__.py ADDED
File without changes
agente/agent.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from openai import OpenAI
4
+ from qdrant_client import QdrantClient
5
+ from pipelines.message import send_message
6
+ import redis
7
+
8
+ conversation_chat = []
9
+
10
+
11
def run():
    """Run the interactive command-line RAG chat loop.

    Loads the environment from ``.env``, builds the OpenAI, Qdrant and Redis
    clients, then repeatedly: reads a question, embeds it, retrieves the three
    closest child chunks from Qdrant, loads their parent documents from Redis
    and streams the model's answer to stdout. The module-level
    ``conversation_chat`` list accumulates the conversation history.

    Any exception ends the loop and is reported on stdout.
    """
    load_dotenv()

    try:
        oa_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        print("✅ Conectado a OpenAI.")

        qdrant_client = QdrantClient(
            host=os.environ.get("QDRANT_HOST"),
            port=os.environ.get("QDRANT_PORT")
        )
        print("✅ Conectado ao Qdrant.")

        redis_client = redis.Redis(
            host=os.environ.get("REDIS_HOST"),
            port=os.environ.get("REDIS_PORT"),
            decode_responses=True
        )
        print("✅ Conectado ao Redis.")

        while True:
            prompt = input("Digite sua pergunta: ")

            embedding = oa_client.embeddings.create(
                input=[prompt],
                model=os.environ.get("OPENAI_MODEL_EMBEDDING")
            ).data[0].embedding

            child_texts = qdrant_client.search(
                collection_name=os.environ.get("COLLECTION_NAME"),
                query_vector=embedding,
                limit=3
            )

            print("--------- Child text ---------")
            print(child_texts)

            contexts = []

            for child_text in child_texts:
                # Each search hit exposes its payload directly (same access
                # pattern as interface/main.py); indexing the hit is wrong.
                parent_text = redis_client.hgetall(
                    child_text.payload["parent_id"]
                )
                if not parent_text:
                    # No hash stored under this parent id: skip the hit
                    # instead of failing the whole session with a KeyError.
                    continue
                contexts.append({
                    "content": parent_text["content"],
                    "url": parent_text["url"]
                })

            print("--------- Contexts ---------")
            print(contexts)

            # Send every retrieved context, not only the last one built.
            stream_response = send_message(
                oa_client,
                contexts,
                prompt,
                conversation_chat
            )

            print("--------- Response Agent ---------")
            response = ""
            for chunk in stream_response:
                delta = chunk.choices[0].delta.content
                if delta is not None:
                    response += delta
                    print(delta, end="")
            conversation_chat.append({
                "role": "assistant",
                "content": response
            })

            is_exit = input("\nDeseja sair? (s/n): ")
            if is_exit == "s":
                break
    except Exception as error:
        print(f"❌ Erro: {error}")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ run()
agente/interface/chatbot.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from dotenv import load_dotenv
4
+ import edgedb
5
+ from openai import OpenAI
6
+ from qdrant_client import QdrantClient
7
+ import streamlit as st
8
+
9
+ # Add the parent directory to the Python path
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+
12
+ from pipelines.message import send_message # noqa
13
+
14
+ st.set_page_config(page_title="Carlos AI Agent")
15
+
16
+
17
+ load_dotenv()
18
+
19
+
20
@st.cache_resource
def connect_to_services():
    """Build the OpenAI, Qdrant and EdgeDB clients used by the chatbot.

    Wrapped in ``st.cache_resource`` so the clients are created through
    Streamlit's resource cache rather than on every script run.

    Returns:
        A ``(openai_client, qdrant_client, edgedb_client)`` tuple.
    """
    env = os.environ

    openai_client = OpenAI(api_key=env.get("OPENAI_API_KEY"))
    vector_client = QdrantClient(
        url=env.get("QDRANT_URL"),
        api_key=env.get("QDRANT_KEY"),
    )
    # EdgeDB connection settings are resolved by the client library itself.
    graph_client = edgedb.create_client()

    return openai_client, vector_client, graph_client
34
+
35
+
36
def send_message_for_ai(prompt, oa_client, qdrant_client, edgedb_client):
    """Answer *prompt* with retrieval-augmented context.

    Embeds the prompt, fetches the three nearest child chunks from Qdrant,
    loads each chunk's parent ``Pattern`` object from EdgeDB and forwards the
    collected contexts to ``send_message``.

    Args:
        prompt: The user's question.
        oa_client: OpenAI client used for the embedding and chat calls.
        qdrant_client: Qdrant client holding the child-chunk vectors.
        edgedb_client: EdgeDB client holding the parent documents.

    Returns:
        The streaming chat completion returned by ``send_message``.
    """
    embedding = oa_client.embeddings.create(
        input=[prompt],
        model=os.environ.get("OPENAI_MODEL_EMBEDDING")
    ).data[0].embedding

    child_texts = qdrant_client.search(
        collection_name=os.environ.get("COLLECTION_NAME"),
        query_vector=embedding,
        limit=3
    )

    contexts = []

    for child_text in child_texts:
        parents = edgedb_client.query('''
            SELECT Pattern {
                content,
                url,
                parent_id
            }
            FILTER .id = <uuid>$parent_id
        ''', parent_id=child_text.payload["parent_id"])
        if not parents:
            # The vector store references a parent that no longer exists;
            # skip it rather than raising IndexError on an empty result.
            continue
        parent_text = parents[0]
        contexts.append({
            "content": parent_text.content,
            "url": parent_text.url,
            "parent_id": parent_text.parent_id
        })

    # A fresh history list is passed on every call, so each question is
    # answered without previous turns.
    return send_message(
        oa_client,
        contexts,
        prompt,
        []
    )
77
+
78
+
79
+ oa_client, qdrant_client, edgedb_client = connect_to_services()
80
+
81
+
82
def display_chat():
    """Render every message kept in ``st.session_state.messages``."""
    for entry in st.session_state.messages:
        bubble = st.chat_message(entry["role"])
        bubble.write(entry["content"])
85
+
86
+
87
def sidebar_content():
    """Draw the sidebar: logo, frequent-topic buttons and a "VOLTAR" button.

    The buttons are only rendered; their return values are not used here.
    """
    st.image('https://www.gov.br/++theme++padrao_govbr/img/govbr-logo-large.png', width=200)
    st.header("Tópicos frequentes")

    # Example topic buttons.
    for label in (
        "Niveis da conta govbr.",
        "Dúvidas no reconhecimento facial.",
        "Dúvidas na autenticação dos bancos.",
        "Dúvidas para aumentar o nível com a cin.",
    ):
        st.button(label)

    # Blank rows used purely as vertical spacing.
    for _ in range(5):
        st.write("")

    # Three equal columns; the button goes in the middle one to centre it.
    _, middle, _ = st.columns([1, 1, 1])
    with middle:
        st.button("VOLTAR")
110
+
111
+
112
+ # Função principal de processamento de input do chat
113
def process_user_input():
    """Handle one chat submission: show it, stream the answer, store both.

    Does nothing when the chat input has no submitted text on this run.
    """
    if prompt := st.chat_input(placeholder="Digite sua mensagem"):
        # Store and echo the user's message.
        st.session_state.messages.append({"role": "user", "content": prompt})
        st.chat_message("user").write(prompt)

        stream = send_message_for_ai(
            prompt, oa_client, qdrant_client, edgedb_client)

        # write_stream renders the chunks as they arrive and returns the
        # complete text. Store that text, not the one-shot stream object,
        # so the history can be re-rendered by display_chat on later runs.
        answer = st.chat_message("assistant").write_stream(stream)
        st.session_state.messages.append(
            {"role": "assistant", "content": answer})
126
+
127
+
128
+ #system_msg = st.chat_message("system")
129
+
130
+
131
+ # Configuração inicial
132
+ if "messages" not in st.session_state:
133
+ st.session_state["messages"] = [{"role": "assistant", "content": "Como eu posso ajudar?"}]
134
+
135
+ # Exibição da barra lateral
136
+ with st.sidebar:
137
+ sidebar_content()
138
+
139
+ # Exibição do título e subtítulo
140
+ st.title("Bem-vindo à ajuda do gov.br")
141
+ st.caption("💬 Lorem ipsum dolor sit amet, consectetur adipiscing elit.")
142
+
143
+ # Exibição do chat
144
+ display_chat()
145
+
146
+ # Processamento da entrada do usuário no chat
147
+ process_user_input()
148
+
149
+
150
+
agente/interface/main.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from dotenv import load_dotenv
4
+ from openai import OpenAI
5
+ from qdrant_client import QdrantClient
6
+ import redis
7
+ import streamlit as st
8
+
9
+ # Add the parent directory to the Python path
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+
12
+ from pipelines.message import send_message # noqa
13
+
14
+ st.set_page_config(page_title="Carlos AI Agent")
15
+
16
+
17
+ load_dotenv()
18
+
19
+
20
@st.cache_resource
def connect_to_services():
    """Build the OpenAI, Qdrant and Redis clients used by the prototype UI.

    Wrapped in ``st.cache_resource`` so the clients are created through
    Streamlit's resource cache rather than on every script run.

    Returns:
        A ``(openai_client, qdrant_client, redis_client)`` tuple.
    """
    env = os.environ

    openai_client = OpenAI(api_key=env.get("OPENAI_API_KEY"))
    vector_client = QdrantClient(
        host=env.get("QDRANT_HOST"),
        port=env.get("QDRANT_PORT"),
    )
    parent_store = redis.Redis(
        host=env.get("REDIS_HOST"),
        port=env.get("REDIS_PORT"),
        decode_responses=True,
    )

    return openai_client, vector_client, parent_store
38
+
39
+
40
def send_message_for_ai(prompt, oa_client, qdrant_client, redis_client):
    """Answer *prompt* with retrieval-augmented context.

    Embeds the prompt, fetches the three nearest child chunks from Qdrant,
    loads each chunk's parent hash from Redis, writes the collected contexts
    to the module-level ``system_msg`` chat container and forwards them to
    ``send_message``.

    Args:
        prompt: The user's question.
        oa_client: OpenAI client used for the embedding and chat calls.
        qdrant_client: Qdrant client holding the child-chunk vectors.
        redis_client: Redis client holding the parent documents as hashes.

    Returns:
        The streaming chat completion returned by ``send_message``.
    """
    embedding = oa_client.embeddings.create(
        input=[prompt],
        model=os.environ.get("OPENAI_MODEL_EMBEDDING")
    ).data[0].embedding

    child_texts = qdrant_client.search(
        collection_name=os.environ.get("COLLECTION_NAME"),
        query_vector=embedding,
        limit=3
    )

    contexts = []

    for child_text in child_texts:
        parent_text = redis_client.hgetall(
            child_text.payload["parent_id"]
        )
        if not parent_text:
            # Nothing stored under this parent id: skip the hit instead of
            # raising KeyError on the empty mapping.
            continue
        contexts.append({
            "content": parent_text["content"],
            "url": parent_text["url"],
            "parent_id": parent_text["parent_id"]
        })

    # system_msg is created at module level further down the script, before
    # this function is first called.
    system_msg.write(
        f"""Contexto: {contexts}""")

    # A fresh history list is passed on every call, so each question is
    # answered without previous turns.
    return send_message(
        oa_client,
        contexts,
        prompt,
        []
    )
76
+
77
+
78
+ oa_client, qdrant_client, redis_client = connect_to_services()
79
+
80
+ st.write("# Carlos AI Agent")
81
+
82
+ prompt = st.chat_input("Digite sua pergunta:")
83
+ user_msg = st.chat_message("user")
84
+ assistant_msg = st.chat_message("assistant")
85
+ system_msg = st.chat_message("system")
86
+
87
+ if prompt:
88
+ user_msg.write(prompt)
89
+ response = send_message_for_ai(
90
+ prompt, oa_client, qdrant_client, redis_client)
91
+ assistant_msg.write_stream(response)
agente/memory/.gitkeep ADDED
File without changes
agente/pipelines/.gitkeep ADDED
File without changes
agente/pipelines/__init__.py ADDED
File without changes
agente/pipelines/message.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import Client
3
+ from utils.file import File
4
+
5
+
6
def send_message(oa_client: Client, context, question, conversation_chat):
    """Append the user's turn to the conversation and start a streamed reply.

    Args:
        oa_client: OpenAI client used for the chat completion.
        context: Retrieved context, interpolated into the user message.
        question: The user's question.
        conversation_chat: Message history. It is mutated in place: the
            system prompt is added when the list is empty, then the new user
            message is appended. Callers that keep history rely on this.

    Returns:
        The streaming chat completion for the updated history.
    """
    if not conversation_chat:
        # Resolve the prompt relative to this file (<package>/prompts/system.md)
        # so it is found regardless of the current working directory.
        package_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        system_message = File(
            os.path.join(package_dir, "prompts", "system.md"))
        conversation_chat.append({
            "role": "system",
            "content": system_message
        })

    conversation_chat.append({
        "role": "user",
        "content": f"Context: {context}\nQuestion: {question}"
    })

    return oa_client.chat.completions.create(
        model=os.environ.get("OPENAI_MODEL_CHAT"),
        messages=conversation_chat,
        stream=True
    )
24
+
agente/pipelines/oac/oac.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import Client
2
+
3
+
4
def get_description_of_image(oa_client: Client, image_url):
    """Ask the chat model for a short description of the image at *image_url*.

    Args:
        oa_client: OpenAI client used for the chat completion.
        image_url: Location of the image, sent to the model as an
            ``image_url`` content part.

    Returns:
        The text content of the first choice of the completion.
    """
    system_turn = {
        "role": "system",
        "content": """
                You are an image descriptor who will describe images using the Brazilian Portuguese language.
                This description should be short and to the point, with a maximum of 50 characters, to be placed in the alt tag of an HTML.
                """  # noqa
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": f"{image_url}"}},
        ]
    }

    completion = oa_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_turn, user_turn],
        max_tokens=300
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
agente/pipelines/rag.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import time
4
+ from typing import List
5
+
6
+ from dotenv import load_dotenv
7
+ from openai import OpenAI
8
+ from qdrant_client import QdrantClient
9
+
10
+ from utils.markdown import generate_image_description_with_empty_description
11
+ from utils.cli import \
12
+ print_execution_time, start_loading_animation, stop_loading_animation
13
+ from splitter import splitter
14
+ from qdrant_client.models import Distance, VectorParams
15
+ from qdrant_client.models import PointStruct
16
+
17
+
18
+ HEADERS_TO_SPLIT_ON = [
19
+ ("#", "Header 1"),
20
+ ("##", "Header 2"),
21
+ ("###", "Header 3"),
22
+ ("####", "Header 4"),
23
+ ("#####", "Header 5"),
24
+ ("######", "Header 6"),
25
+ ]
26
+
27
+
28
def safe_join(thread, done: List[bool]):
    """Stop a spinner thread that is still running and wait for it to end.

    Args:
        thread: The thread to stop, or ``None`` if it was never started.
        done: One-element flag list polled by the thread; ``done[0]`` is set
            to ``True`` only when the thread is still alive.
    """
    if thread is None or not thread.is_alive():
        return
    done[0] = True
    thread.join()
32
+
33
+
34
def run():
    """Run the ingestion pipeline: websites -> parent/child chunks -> vectors.

    Steps:
      1. Read every ``Website`` row from the sqlite database.
      2. Fill in descriptions for images that have an empty alt text.
      3. Split the texts into parent documents and child chunks.
      4. Store the parent documents in the sqlite ``ParentText`` table.
      5. Embed each child chunk and upsert it into the Qdrant collection.
      6. Print a summary report.

    Any exception is reported on stdout; spinner threads and the sqlite
    connection are always cleaned up.
    """
    loading_split = None
    loading_image_description = None
    loading_embedding = None
    done_split = [False]
    done_image_description = [False]
    done_embeding = [False]
    conn = None

    try:
        load_dotenv()

        start_time = time.time()
        print("➗ Começando o RAG...")

        conn = sqlite3.connect('../meu_banco.db')
        cursor = conn.cursor()
        print("✅ Conectado ao sqlite.")

        # NOTE(review): ":memory:" looks like a process-local Qdrant instance,
        # so the vectors written below would not outlive this run — confirm
        # this is intended for the prototype.
        qdrant_client = QdrantClient(":memory:")
        print("✅ Conectado ao qdrant.")

        oa_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        print("✅ Conectado a OpenAI.")

        collection_name = os.environ.get("COLLECTION_NAME")
        if qdrant_client.collection_exists(collection_name):
            qdrant_client.delete_collection(
                collection_name=collection_name
            )
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
        )
        print("✅ Collection criada.")

        # Only the website's own columns are used below. Joining Image and
        # Video here would repeat each website once per image/video pair and
        # duplicate every downstream chunk and embedding.
        cursor.execute('''
            SELECT w.id, w.url, w.text_content
            FROM Website w
        ''')
        web_sites = [
            {
                "text_content": text_content,
                "text_id": website_id,
                "url": url
            }
            for website_id, url, text_content in cursor.fetchall()
        ]

        loading_image_description = start_loading_animation(
            done_image_description,
            "Gerando Descrições de imagens caso não tenha..."
        )

        texts_with_images_descriptions, error_images_description = \
            generate_image_description_with_empty_description(
                oa_client,
                web_sites
            )
        stop_loading_animation(
            done_image_description,
            loading_image_description
        )
        print("✅ Descrição de imagens geradas.")

        texts_parent_splitted, texts_child_splitted = splitter.split(
            texts_with_images_descriptions,
            HEADERS_TO_SPLIT_ON
        )
        print("✅ Textos dividos.")

        loading_split = start_loading_animation(
            done_split,
            "Dividindo textos..."
        )
        # Create the table once, before the insert loop.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS ParentText
            (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                content TEXT,
                parent_id INTEGER,
                url TEXT,
                FOREIGN KEY (parent_id) REFERENCES Website(id)
            );
        ''')
        for text_parent_splitted in texts_parent_splitted:
            cursor.execute('''
                INSERT INTO ParentText (content, parent_id, url)
                VALUES (?, ?, ?);
            ''', (
                text_parent_splitted["content"].page_content,
                text_parent_splitted["parent_id"],
                text_parent_splitted["url"]
            ))
        # Without an explicit commit the inserted rows are discarded when the
        # connection goes away.
        conn.commit()
        stop_loading_animation(done_split, loading_split)
        print("✅ Texto pai salvo no sqlite.")

        count_embeddings = 0
        total_child_splitted = len(texts_child_splitted)
        for text_child_splitted in texts_child_splitted:
            # A new flag and spinner per chunk so the progress text updates.
            done_embeding = [False]
            loading_embedding = start_loading_animation(
                done_embeding,
                f"""Gerando embeddings e salvando no qdrant: {count_embeddings} de {total_child_splitted}"""  # noqa
            )
            embedding = oa_client.embeddings.create(
                input=[text_child_splitted["content"].page_content],
                model=os.environ.get("OPENAI_MODEL_EMBEDDING")
            ).data[0].embedding
            qdrant_client.upsert(
                collection_name=collection_name,
                points=[PointStruct(
                    id=str(text_child_splitted["id"]),
                    vector=embedding,
                    payload={
                        "content": text_child_splitted["content"].page_content,
                        "parent_id": text_child_splitted["parent_id"],
                        "type": "text"
                    }
                )]
            )
            count_embeddings += 1
            stop_loading_animation(done_embeding, loading_embedding)
        print("✅ Texto filho salvo no qdrant.")

        print("✅ RAG finalizado.")

        print(
            f"""
            📊 Relatório:\n
            \t Tempo de execução: {print_execution_time(start_time)}\n
            \t Textos Filhos e Embedding gerados: {len(texts_child_splitted)}\n
            \t Textos pai gerados: {len(texts_parent_splitted)}\n
            \t WebSites recuperados da base: {len(web_sites)}\n
            \t Erros ao gerar descrição de imagens: {len(error_images_description)}\n
            """)  # noqa

    except Exception as error:
        print(f"❌ Erro: {error}")
    finally:
        # Stop any spinner left running by an error, then release sqlite.
        safe_join(loading_split, done_split)
        safe_join(loading_image_description, done_image_description)
        safe_join(loading_embedding, done_embeding)
        if conn is not None:
            conn.close()
182
+
183
+
184
+ if __name__ == "__main__":
185
+ run()
agente/pipelines/splitter/splitter.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import \
2
+ MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
3
+ import uuid
4
+
5
+
6
def split(texts, headers_to_split_on):
    """Split texts into parent documents and child chunks.

    Follows the "Parent Document Retriever" approach: each text is first cut
    on markdown headers into parent documents, and each parent is then cut
    into smaller child chunks that reference their parent.

    Args:
        texts: Dicts describing the source texts. Each must carry ``url`` and
            either ``text_content``/``text_id`` or ``content``/``id`` (the
            shape produced by the image-description step in rag.py).
        headers_to_split_on: Header configuration passed to
            ``markdown_split``.

    Returns:
        ``[parents, children]``. Parents are dicts with ``id`` (new UUID),
        ``parent_id`` (source text id), ``content`` and ``url``; children are
        dicts with ``id`` (new UUID), ``parent_id`` (the parent's UUID) and
        ``content``.
    """
    texts_parent_splitted_list = []
    texts_child_splitted_list = []

    for text in texts:
        # Accept both key conventions used in this project; without this the
        # output of the image-description step raises KeyError here.
        content = (
            text["text_content"] if "text_content" in text
            else text["content"])
        source_id = text["text_id"] if "text_id" in text else text["id"]

        for parent_doc in markdown_split(content, headers_to_split_on):
            parent_id = uuid.uuid4()
            texts_parent_splitted_list.append({
                "id": parent_id,
                "parent_id": source_id,
                "content": parent_doc,
                "url": text["url"]
            })

            for child_doc in text_split(parent_doc.page_content):
                texts_child_splitted_list.append({
                    "id": uuid.uuid4(),
                    "parent_id": parent_id,
                    "content": child_doc
                })

    return [texts_parent_splitted_list, texts_child_splitted_list]
43
+
44
+
45
def markdown_split(text, headers_to_split_on):
    """Split markdown *text* on the given headers.

    Returns whatever ``MarkdownHeaderTextSplitter.split_text`` produces for
    the text.
    """
    return MarkdownHeaderTextSplitter(headers_to_split_on).split_text(text)
49
+
50
+
51
def text_split(text):
    """Cut *text* into overlapping chunks (size 500, overlap 100).

    Lengths are measured with ``len``. Returns the documents created by the
    recursive character splitter.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    return chunker.create_documents([text])
agente/pipelines/utils/cli.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import timedelta
2
+ import itertools
3
+ import sys
4
+ import threading
5
+ import time
6
+
7
+
8
def loading_animation(done, text):
    """Draw a spinner after *text* on stdout until ``done()`` is true.

    Args:
        done: Zero-argument callable polled before every frame.
        text: Label rewritten in place before the spinner character.
    """
    frames = itertools.cycle(['|', '/', '-', '\\'])

    while not done():
        # "\r" moves back to the start of the line so each frame overwrites
        # the previous one.
        sys.stdout.write(f'\r{text} ' + next(frames))
        sys.stdout.flush()
        time.sleep(0.1)

    sys.stdout.write('\r')
    sys.stdout.flush()
17
+
18
+
19
def start_loading_animation(done, text):
    """Start the spinner in a background thread and return that thread.

    Args:
        done: One-element list; the spinner stops once ``done[0]`` is truthy.
        text: Label shown before the spinner character.
    """
    def is_done():
        return done[0]

    spinner = threading.Thread(
        target=loading_animation, args=(is_done, text))
    spinner.start()
    return spinner
24
+
25
+
26
def stop_loading_animation(done, thread):
    """Signal the spinner to stop and wait for its thread to finish.

    Args:
        done: One-element flag list polled by the spinner; ``done[0]`` is set
            to ``True``.
        thread: The spinner thread, or ``None`` if it was never started (the
            flag is still set, nothing is joined).
    """
    done[0] = True
    if thread is not None:
        thread.join()
29
+
30
+
31
def print_execution_time(start_time):
    """Format the time elapsed since *start_time* as ``HH:MM:SS:mmm``.

    Despite the name, nothing is printed; the formatted string is returned.

    Args:
        start_time: A ``time.time()`` timestamp taken when the work began.
    """
    elapsed = timedelta(seconds=time.time() - start_time).total_seconds()

    hours, rest = divmod(elapsed, 3600)
    minutes, rest = divmod(rest, 60)
    seconds, fraction = divmod(rest, 1)
    millis = fraction * 1000

    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}:{int(millis):03}"  # noqa
agente/pipelines/utils/markdown.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from oac.oac import get_description_of_image
3
+
4
+
5
def generate_image_description_with_empty_description(oa_client, websites):
    """Fill in alt text for markdown images whose description is empty.

    Every ``![](<link>)`` occurrence in a website's text is rewritten to
    ``![<description>](<link>)`` using ``get_description_of_image``. If the
    rewrite fails for a website, its original text is kept and the failure
    is recorded.

    Args:
        oa_client: OpenAI client forwarded to ``get_description_of_image``.
        websites: Dicts with ``text_id``, ``text_content`` and ``url``.

    Returns:
        ``(texts, errors)``: ``texts`` holds one dict per website with keys
        ``id``, ``content`` and ``url``; ``errors`` holds one
        ``"<text_id>: <text_content>"`` string per failed website.
    """
    # Matches a markdown image with an empty description: ![](<link>)
    empty_alt_image = re.compile(r'!\[\]\(([^)]+)\)')

    def describe(match):
        url = match.group(1)
        return f'![{get_description_of_image(oa_client, url)}]({url})'

    processed = []
    failures = []
    for site in websites:
        content = site["text_content"]
        try:
            content = empty_alt_image.sub(describe, site["text_content"])
        except Exception:
            # Keep the untouched text and remember which website failed.
            failures.append(f"{site['text_id']}: {site['text_content']}")
        processed.append({
            "id": site["text_id"],
            "content": content,
            "url": site["url"]
        })

    return processed, failures
agente/prompts/system.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an assistant designed to work as a FAQ for Brazilian Government Services. Be concise in your answers.
2
+
3
+ Your natural language is Brazilian Portuguese.
4
+
5
+ You will receive two labels in the user's question, one called 'Context' and the other 'Question'. Inside the 'Context', you will receive a JSON list in the following format, enclosed between three backticks.
6
+
7
+ ```
8
+ [
9
+ {
10
+ content: here you will receive the content in markdown format of the context that you will use to answer the user's question,
11
+ url: here is the URL from which this content was taken, and you should reference it in your answer
12
+ }
13
+ ]
14
+ ```
15
+
16
+ Since you will receive a list with these contexts, you will answer based on them what is inside the 'Question' label.
17
+
18
+ The format of your response should be in markdown. You can include images in your response by interpreting them based on the description provided for each image and responding in markdown format.
19
+
20
+ If you do not find the answer in the given context, simply respond to the user with "Não encontrei sua resposta."
21
+
22
+ Do not use information that is not in the given context.
agente/prompts/tamplates/.gitkeep ADDED
File without changes
agente/tools/.gitkeep ADDED
File without changes
agente/tools/__init__.py ADDED
File without changes
agente/tools/python/.gitkeep ADDED
File without changes
agente/utils/.gitkeep ADDED
File without changes
agente/utils/file.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def File(file_path):
    """Return the whole content of *file_path*, read as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
edgedb/dbschema/initial.esdl ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Schema for the collected website data and the retrieval parent documents.
module default {
    # An image found on a website.
    type Image {
        name: str;
        path: str;
        url: str;
        hyperlink: str;
        alt: str;
    }

    # A video found on a website.
    type Video {
        name: str;
        url: str;
        hyperlink: str;
        alt: str;
    }

    # Text content of a website, linked from Website.text.
    type Text {
        content: str;
    }

    # A collected page with its links to images, videos and text.
    type Website {
        url: str;
        relative_path: str;
        hyperrefs: array<str>;
        multi images: Image;
        multi videos: Video;
        text: Text;
        segmented_texts: array<str>;
    }

    # Parent document looked up by object id from interface/chatbot.py,
    # which reads content, url and parent_id.
    type Pattern {
        content: str;
        parent_id: uuid;
        url: str;
    }
}
edgedb/dbschema/migrations/00001-m14hvdp.edgeql ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CREATE MIGRATION m14hvdpwu2t2cxnf75otbhvkltgiwr5hqx7fh74t7rzysjw6fy34oa
2
+ ONTO initial
3
+ {
4
+ CREATE TYPE default::Image {
5
+ CREATE PROPERTY alt: std::str;
6
+ CREATE PROPERTY hyperlink: std::str;
7
+ CREATE PROPERTY name: std::str;
8
+ CREATE PROPERTY path: std::str;
9
+ CREATE PROPERTY url: std::str;
10
+ };
11
+ CREATE TYPE default::Text {
12
+ CREATE PROPERTY content: std::str;
13
+ };
14
+ CREATE TYPE default::Video {
15
+ CREATE PROPERTY alt: std::str;
16
+ CREATE PROPERTY hyperlink: std::str;
17
+ CREATE PROPERTY name: std::str;
18
+ CREATE PROPERTY url: std::str;
19
+ };
20
+ CREATE TYPE default::Website {
21
+ CREATE MULTI LINK images: default::Image;
22
+ CREATE LINK text: default::Text;
23
+ CREATE MULTI LINK videos: default::Video;
24
+ CREATE PROPERTY hyperrefs: array<std::str>;
25
+ CREATE PROPERTY relative_path: std::str;
26
+ CREATE PROPERTY segmented_texts: array<std::str>;
27
+ CREATE PROPERTY url: std::str;
28
+ };
29
+ };
edgedb/dbschema/migrations/00002-m1dsfjl.edgeql ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ CREATE MIGRATION m1dsfjlp6m7m6zym3olrmjirgmulk4m4vk2er7x22ohmbskv2kgbxa
2
+ ONTO m14hvdpwu2t2cxnf75otbhvkltgiwr5hqx7fh74t7rzysjw6fy34oa
3
+ {
4
+ CREATE TYPE default::Pattern {
5
+ CREATE PROPERTY content: std::str;
6
+ CREATE PROPERTY parent_id: std::uuid;
7
+ CREATE PROPERTY url: std::str;
8
+ };
9
+ };
edgedb/depopulate_edgedb.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import edgedb
from dotenv import load_dotenv

load_dotenv()

resp = input('Are you sure you want to depopulate the database? (y/n) ')

if resp.lower() == 'y':
    client = edgedb.create_client()
    try:
        # execute() is the client call meant for multi-statement scripts;
        # query() is meant for a single statement that returns results.
        # Website is deleted first because it links to the other types.
        # NOTE(review): Pattern objects are not removed here — confirm
        # whether that is intended.
        client.execute('''
            DELETE Website;
            DELETE Image;
            DELETE Video;
            DELETE Text;
        ''')
    finally:
        client.close()
    print('Database depopulated.')
else:
    print('Operation canceled.')
edgedb/populate_edgedb.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import edgedb
import json
import os
from dotenv import load_dotenv

load_dotenv()

client = edgedb.create_client()

try:
    # Walk the scraped output and load every JSON description into EdgeDB.
    for root, dirs, files in os.walk("../Banco_de_Dados/Coleta/downloaded_files"):
        for file in files:
            if not file.endswith(".json"):
                continue

            # Explicit UTF-8: the platform default encoding may differ and
            # would garble accented text.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                data = json.load(f)

            # NOTE(review): drops the first five path components of root;
            # this depends on the exact directory depth — confirm.
            data_path = './' + '/'.join(root.split('/')[5:])

            # Insert the Website together with its Text.
            client.query('''
                INSERT Website {
                    url := <str>$url,
                    relative_path := <str>$relative_path,
                    hyperrefs := <array<str>>$gov_links,
                    images := {},
                    videos := {},
                    text := (
                        INSERT Text {
                            content := <str>$content
                        }
                    )
                };
            ''',
                url=data['absolute_url'],
                relative_path=data_path,
                gov_links=data['gov_links'],
                content=data['text'])

            # Insert each Image and attach it to the Website with this URL.
            for image in data.get('images', []):
                client.query('''
                    UPDATE Website
                    FILTER .url = <str>$url
                    SET {
                        images += {
                            (INSERT Image {
                                name := <str>$name,
                                path := <str>$path,
                                url := <str>$image_url,
                                hyperlink := <str>$hyperlink,
                                alt := <str>$alt,
                            })
                        }
                    };
                ''',
                    url=data['absolute_url'],
                    path=image['path'],
                    name=image['name'],
                    image_url=image['url'],
                    hyperlink=image['hyperlink'],
                    alt=image['alt'])

            # Insert each Video and attach it to the Website with this URL.
            for video in data.get('videos', []):
                client.query('''
                    UPDATE Website
                    FILTER .url = <str>$url
                    SET {
                        videos += {
                            (INSERT Video {
                                name := <str>$name,
                                url := <str>$video_url,
                                hyperlink := <str>$hyperlink,
                                alt := <str>$alt,
                            })
                        }
                    };
                ''',
                    url=data['absolute_url'],
                    name=video['name'],
                    video_url=video['url'],
                    hyperlink=video['hyperlink'],
                    alt=video['alt'])
finally:
    client.close()