Spaces:
Sleeping
Sleeping
Pecximenes
committed on
Commit
•
853a071
1
Parent(s):
9e44228
Adding v1 of rag-agent
Browse files- Dockerfile +14 -0
- agente/.env-example +18 -0
- agente/README.md +6 -0
- agente/WORKDIR/.gitkeep +0 -0
- agente/__init__.py +0 -0
- agente/agent.py +91 -0
- agente/interface/chatbot.py +150 -0
- agente/interface/main.py +91 -0
- agente/memory/.gitkeep +0 -0
- agente/pipelines/.gitkeep +0 -0
- agente/pipelines/__init__.py +0 -0
- agente/pipelines/message.py +24 -0
- agente/pipelines/oac/oac.py +33 -0
- agente/pipelines/rag.py +185 -0
- agente/pipelines/splitter/splitter.py +61 -0
- agente/pipelines/utils/cli.py +42 -0
- agente/pipelines/utils/markdown.py +37 -0
- agente/prompts/system.md +22 -0
- agente/prompts/tamplates/.gitkeep +0 -0
- agente/tools/.gitkeep +0 -0
- agente/tools/__init__.py +0 -0
- agente/tools/python/.gitkeep +0 -0
- agente/utils/.gitkeep +0 -0
- agente/utils/file.py +3 -0
- edgedb/dbschema/initial.esdl +36 -0
- edgedb/dbschema/migrations/00001-m14hvdp.edgeql +29 -0
- edgedb/dbschema/migrations/00002-m1dsfjl.edgeql +9 -0
- edgedb/depopulate_edgedb.py +19 -0
- edgedb/populate_edgedb.py +77 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Runtime image for the RAG agent's Streamlit chat interface.
FROM python:3.8-slim

# All application code lives under /app inside the container.
ENV APP_HOME=/app
WORKDIR ${APP_HOME}

# BUG FIX (build caching): install dependencies BEFORE copying the
# application source.  The original copied ./agente first, so every
# source edit invalidated the pip-install layer and forced a full
# dependency reinstall.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source last; source-only changes now reuse the
# cached dependency layer.
COPY ./agente ./

# Streamlit serves the chat UI on this port.
EXPOSE 7860

CMD ["streamlit", "run", "interface/chatbot.py", "--server.port=7860"]
agente/.env-example
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
OPENAI_API_KEY="SUA-KEY"
|
2 |
+
OPENAI_MODEL_CHAT="gpt-3.5-turbo"
|
3 |
+
OPENAI_MODEL_EMBEDDING="text-embedding-3-small"
|
4 |
+
|
5 |
+
COLLECTION_NAME="sgd"
|
6 |
+
|
7 |
+
EDGEDB_HOST="localhost"
|
8 |
+
EDGEDB_USER="sgd"
|
9 |
+
EDGEDB_PASSWORD="secret"
|
10 |
+
EDGEDB_PORT="5656"
|
11 |
+
|
12 |
+
QDRANT_HOST="localhost"
|
13 |
+
QDRANT_PORT="6333"
|
14 |
+
|
15 |
+
REDIS_HOST="localhost"
|
16 |
+
REDIS_PORT="6379"
|
17 |
+
|
18 |
+
HUGGINGFACE_API_KEY="SUA-KEY"
|
agente/README.md
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
Para testar o protótipo
|
3 |
+
|
4 |
+
```
|
5 |
+
streamlit run interface/main.py
|
6 |
+
```
|
agente/WORKDIR/.gitkeep
ADDED
File without changes
|
agente/__init__.py
ADDED
File without changes
|
agente/agent.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
from pipelines.message import send_message
import redis

# Running transcript of the chat: send_message() seeds the system and
# user turns; run() appends each assistant reply below.
conversation_chat = []


def run():
    """Interactive CLI loop for the RAG agent.

    Embeds the user's question, retrieves the closest child chunks from
    Qdrant, resolves their parent documents from Redis, and streams the
    model's answer to stdout until the user chooses to exit.
    """
    load_dotenv()

    try:
        oa_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        print("✅ Conectado a OpenAI.")

        # Ports come from the environment as strings; convert explicitly.
        qdrant_client = QdrantClient(
            host=os.environ.get("QDRANT_HOST"),
            port=int(os.environ.get("QDRANT_PORT"))
        )
        print("✅ Conectado ao Qdrant.")

        # decode_responses=True so hgetall() yields str keys/values.
        redis_client = redis.Redis(
            host=os.environ.get("REDIS_HOST"),
            port=int(os.environ.get("REDIS_PORT")),
            decode_responses=True
        )
        print("✅ Conectado ao Redis.")

        while True:
            prompt = input("Digite sua pergunta: ")

            embedding = oa_client.embeddings.create(
                input=[prompt],
                model=os.environ.get("OPENAI_MODEL_EMBEDDING")
            ).data[0].embedding

            child_texts = qdrant_client.search(
                collection_name=os.environ.get("COLLECTION_NAME"),
                query_vector=embedding,
                limit=3
            )

            print("--------- Child text ---------")
            print(child_texts)

            contexts = []

            for child_text in child_texts:
                # BUG FIX: each search result is a single scored point,
                # not a sequence — the original `child_text[0].payload`
                # raised on every retrieval.  (The sibling interface
                # code already uses `child_text.payload`.)
                parent_text = redis_client.hgetall(
                    child_text.payload["parent_id"]
                )
                context = {
                    "content": parent_text["content"],
                    "url": parent_text["url"]
                }
                contexts.append(context)

            print("--------- Contexts ---------")
            print(contexts)

            # BUG FIX: pass the full list of retrieved contexts; the
            # original passed `context`, the loop variable, i.e. only
            # the last retrieved document (or NameError on no hits).
            stream_response = send_message(
                oa_client,
                contexts,
                prompt,
                conversation_chat
            )

            print("--------- Response Agent ---------")
            response = ""
            for chunk in stream_response:
                if chunk.choices[0].delta.content is not None:
                    response += chunk.choices[0].delta.content
                    print(chunk.choices[0].delta.content, end="")
            conversation_chat.append({
                "role": "assistant",
                "content": response
            })

            is_exit = input("\nDeseja sair? (s/n): ")
            if is_exit == "s":
                break
    except Exception as error:
        print(f"❌ Erro: {error}")


if __name__ == "__main__":
    run()
agente/interface/chatbot.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
from dotenv import load_dotenv
import edgedb
from openai import OpenAI
from qdrant_client import QdrantClient
import streamlit as st

# Add the parent directory to the Python path so `pipelines` resolves
# when Streamlit runs this file directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from pipelines.message import send_message  # noqa

st.set_page_config(page_title="Carlos AI Agent")


load_dotenv()


@st.cache_resource
def connect_to_services():
    """Create the OpenAI, Qdrant and EdgeDB clients once per process
    (cached across Streamlit reruns)."""
    oa_client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    qdrant_client = QdrantClient(
        url=os.environ.get("QDRANT_URL"),
        api_key=os.environ.get("QDRANT_KEY")
    )

    # Connection parameters are resolved from the standard EDGEDB_*
    # environment variables / project configuration.
    edgedb_client = edgedb.create_client()

    return oa_client, qdrant_client, edgedb_client


def send_message_for_ai(prompt, oa_client, qdrant_client, edgedb_client):
    """Embed *prompt*, retrieve the matching parent documents from
    EdgeDB via the Qdrant child-chunk hits, and return the model's
    streamed chat response."""
    embedding = oa_client.embeddings.create(
        input=[prompt],
        model=os.environ.get("OPENAI_MODEL_EMBEDDING")
    ).data[0].embedding

    child_texts = qdrant_client.search(
        collection_name=os.environ.get("COLLECTION_NAME"),
        query_vector=embedding,
        limit=3
    )

    contexts = []

    # Each child chunk carries the UUID of its parent Pattern object.
    for child_text in child_texts:
        parent_text = edgedb_client.query('''
            SELECT Pattern {
                content,
                url,
                parent_id
            }
            FILTER .id = <uuid>$parent_id
        ''', parent_id=child_text.payload["parent_id"])[0]
        context = {
            "content": parent_text.content,
            "url": parent_text.url,
            "parent_id": parent_text.parent_id
        }
        contexts.append(context)

    stream_response = send_message(
        oa_client,
        contexts,
        prompt,
        []
    )

    return stream_response


oa_client, qdrant_client, edgedb_client = connect_to_services()


def display_chat():
    """Re-render the stored conversation on every Streamlit rerun."""
    for msg in st.session_state.messages:
        st.chat_message(msg["role"]).write(msg["content"])


def sidebar_content():
    """Render the gov.br sidebar: logo, topic shortcuts and back button."""
    st.image('https://www.gov.br/++theme++padrao_govbr/img/govbr-logo-large.png', width=200)
    st.header("Tópicos frequentes")

    # Example shortcut buttons (currently decorative — no callbacks).
    topics = [
        "Niveis da conta govbr.",
        "Dúvidas no reconhecimento facial.",
        "Dúvidas na autenticação dos bancos.",
        "Dúvidas para aumentar o nível com a cin."
    ]

    for topic in topics:
        st.button(topic)

    # Blank writes used purely for vertical spacing.
    for _ in range(5):
        st.write("")

    # Centered button.
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        st.button("VOLTAR")


def process_user_input():
    """Handle a chat submission: echo it, stream the answer, store both."""
    if prompt := st.chat_input(placeholder="Digite sua mensagem"):
        st.session_state.messages.append({"role": "user", "content": prompt})
        st.chat_message("user").write(prompt)

        stream_response = send_message_for_ai(
            prompt, oa_client, qdrant_client, edgedb_client)

        # BUG FIX: the response is a stream, not text.  write_stream()
        # renders it incrementally and returns the concatenated text,
        # which is what must be persisted — the original stored the raw
        # stream object in session_state, so the chat history replayed
        # as an object repr (and the consumed stream yielded nothing).
        response_text = st.chat_message("assistant").write_stream(
            stream_response)
        st.session_state.messages.append(
            {"role": "assistant", "content": response_text})


# Initial session state: greeting message shown before any input.
if "messages" not in st.session_state:
    st.session_state["messages"] = [{"role": "assistant", "content": "Como eu posso ajudar?"}]

# Sidebar rendering.
with st.sidebar:
    sidebar_content()

# Title and subtitle.
st.title("Bem-vindo à ajuda do gov.br")
st.caption("💬 Lorem ipsum dolor sit amet, consectetur adipiscing elit.")

# Chat history.
display_chat()

# Handle the user's chat input.
process_user_input()
agente/interface/main.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
import redis
import streamlit as st

# Add the parent directory to the Python path so `pipelines` resolves
# when Streamlit runs this file directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from pipelines.message import send_message  # noqa

st.set_page_config(page_title="Carlos AI Agent")


load_dotenv()


# Cached across Streamlit reruns: the three clients are created once.
@st.cache_resource
def connect_to_services():
    # Build OpenAI / Qdrant / Redis clients from environment config.
    oa_client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    # NOTE(review): QDRANT_PORT/REDIS_PORT arrive as strings from the
    # environment — presumably the client libraries coerce them; confirm.
    qdrant_client = QdrantClient(
        host=os.environ.get("QDRANT_HOST"),
        port=os.environ.get("QDRANT_PORT")
    )

    # decode_responses=True makes hgetall() return str keys/values.
    redis_client = redis.Redis(
        host=os.environ.get("REDIS_HOST"),
        port=os.environ.get("REDIS_PORT"),
        decode_responses=True
    )

    return oa_client, qdrant_client, redis_client


def send_message_for_ai(prompt, oa_client, qdrant_client, redis_client):
    # Embed the question and retrieve the 3 closest child chunks.
    embedding = oa_client.embeddings.create(
        input=[prompt],
        model=os.environ.get("OPENAI_MODEL_EMBEDDING")
    ).data[0].embedding

    child_texts = qdrant_client.search(
        collection_name=os.environ.get("COLLECTION_NAME"),
        query_vector=embedding,
        limit=3
    )

    contexts = []

    # Each child chunk points at a parent document stored as a Redis hash.
    for child_text in child_texts:
        parent_text = redis_client.hgetall(
            child_text.payload["parent_id"]
        )
        # assumes the Redis hash holds content/url/parent_id fields —
        # TODO confirm against the ingestion pipeline.
        context = {
            "content": parent_text["content"],
            "url": parent_text["url"],
            "parent_id": parent_text["parent_id"]
        }
        contexts.append(context)

    # NOTE(review): relies on the module-level `system_msg` defined
    # further down in this file.  It exists before any chat submission
    # at runtime, but this is fragile if the function is imported alone.
    system_msg.write(
        f"""Contexto: {contexts}""")

    # Start a fresh conversation ([]) for every question.
    stream_response = send_message(
        oa_client,
        contexts,
        prompt,
        []
    )

    return stream_response


oa_client, qdrant_client, redis_client = connect_to_services()

st.write("# Carlos AI Agent")

# Chat widgets; `system_msg` is also used inside send_message_for_ai.
prompt = st.chat_input("Digite sua pergunta:")
user_msg = st.chat_message("user")
assistant_msg = st.chat_message("assistant")
system_msg = st.chat_message("system")

if prompt:
    user_msg.write(prompt)
    response = send_message_for_ai(
        prompt, oa_client, qdrant_client, redis_client)
    # write_stream consumes the streamed completion incrementally.
    assistant_msg.write_stream(response)
agente/memory/.gitkeep
ADDED
File without changes
|
agente/pipelines/.gitkeep
ADDED
File without changes
|
agente/pipelines/__init__.py
ADDED
File without changes
|
agente/pipelines/message.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from openai import Client
from utils.file import File


def send_message(oa_client: Client, context, question, conversation_chat):
    """Append the next user turn to *conversation_chat* (seeding the
    system prompt on the first call) and return a streamed completion.

    The conversation list is mutated in place so callers can keep a
    running transcript across turns.
    """
    # First turn: load the system prompt from disk.
    if not conversation_chat:
        conversation_chat.append(
            {"role": "system", "content": File("prompts/system.md")}
        )

    user_turn = {
        "role": "user",
        "content": f"Context: {context}\nQuestion: {question}",
    }
    conversation_chat.append(user_turn)

    chat_model = os.environ.get("OPENAI_MODEL_CHAT")
    return oa_client.chat.completions.create(
        model=chat_model,
        messages=conversation_chat,
        stream=True,
    )
agente/pipelines/oac/oac.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from openai import Client


def get_description_of_image(oa_client: Client, image_url):
    """Return a short Brazilian-Portuguese description of the image at
    *image_url*, suitable for use as an HTML ``alt`` attribute."""
    system_message = {
        "role": "system",
        "content": """
            You are an image descriptor who will describe images using the Brazilian Portuguese language.
            This description should be short and to the point, with a maximum of 50 characters, to be placed in the alt tag of an HTML.
            """  # noqa
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": f"{image_url}"}},
        ],
    }

    completion = oa_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        max_tokens=300,
    )
    return completion.choices[0].message.content
agente/pipelines/rag.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sqlite3
|
3 |
+
import time
|
4 |
+
from typing import List
|
5 |
+
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from openai import OpenAI
|
8 |
+
from qdrant_client import QdrantClient
|
9 |
+
|
10 |
+
from utils.markdown import generate_image_description_with_empty_description
|
11 |
+
from utils.cli import \
|
12 |
+
print_execution_time, start_loading_animation, stop_loading_animation
|
13 |
+
from splitter import splitter
|
14 |
+
from qdrant_client.models import Distance, VectorParams
|
15 |
+
from qdrant_client.models import PointStruct
|
16 |
+
|
17 |
+
|
18 |
+
HEADERS_TO_SPLIT_ON = [
|
19 |
+
("#", "Header 1"),
|
20 |
+
("##", "Header 2"),
|
21 |
+
("###", "Header 3"),
|
22 |
+
("####", "Header 4"),
|
23 |
+
("#####", "Header 5"),
|
24 |
+
("######", "Header 6"),
|
25 |
+
]
|
26 |
+
|
27 |
+
|
def safe_join(thread, done: List[bool]):
    """Signal a loading-animation thread to stop, then wait for it.

    No-op when *thread* was never started (None) or has already
    finished; used from the ``finally`` block so spinners never outlive
    the run.
    """
    if thread is None:
        return
    if thread.is_alive():
        done[0] = True  # tells the animation loop to break out
        thread.join()
def run():
    """End-to-end RAG ingestion.

    Reads scraped websites from SQLite, fills in missing image
    descriptions, splits texts into parent/child chunks, stores the
    parents back into SQLite and upserts child embeddings into Qdrant.
    Prints a summary report at the end.
    """
    loading_split = None
    loading_image_description = None
    loading_embedding = None
    done_split = [False]
    done_image_description = [False]
    done_embeding = [False]
    conn = None

    try:
        load_dotenv()

        start_time = time.time()
        print("➗ Começando o RAG...")

        conn = sqlite3.connect('../meu_banco.db')
        cursor = conn.cursor()
        print("✅ Conectado ao sqlite.")

        qdrant_client = QdrantClient(":memory:")
        print("✅ Conectado ao qdrant.")

        oa_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        print("✅ Conectado a OpenAI.")

        # Recreate the collection from scratch on every run.
        if qdrant_client.collection_exists(os.environ.get("COLLECTION_NAME")):
            qdrant_client.delete_collection(
                collection_name=os.environ.get("COLLECTION_NAME")
            )
        qdrant_client.create_collection(
            collection_name=os.environ.get("COLLECTION_NAME"),
            vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
        )
        print("✅ Collection criada.")

        cursor.execute('''
            SELECT w.id, w.url, w.text_content, w.relative_path, w.hyperrefs,
                i.name AS image_name, i.path AS image_path, i.url AS image_url, i.hyperlink AS image_hyperlink, i.alt AS image_alt,
                v.name AS video_name, v.url AS video_url, v.hyperlink AS video_hyperlink, v.alt AS video_alt
            FROM Website w
            LEFT JOIN Image i ON i.website_id = w.id
            LEFT JOIN Video v ON v.website_id = w.id
        ''')
        web_sites_bruto = cursor.fetchall()
        web_sites = []
        for web_site_bruto in web_sites_bruto:
            web_site = {
                "text_content": web_site_bruto[2],
                "text_id": web_site_bruto[0],
                "url": web_site_bruto[1]
            }
            web_sites.append(web_site)

        loading_image_description = start_loading_animation(
            done_image_description,
            "Gerando Descrições de imagens caso não tenha..."
        )

        texts_with_images_descriptions, error_images_description = \
            generate_image_description_with_empty_description(
                oa_client,
                web_sites
            )
        stop_loading_animation(
            done_image_description,
            loading_image_description
        )
        print("✅ Descrição de imagens geradas.")

        texts_parent_splitted, texts_child_splitted = splitter.split(
            texts_with_images_descriptions,
            HEADERS_TO_SPLIT_ON
        )
        print("✅ Textos dividos.")

        loading_split = start_loading_animation(
            done_split,
            "Dividindo textos..."
        )
        # IMPROVEMENT: create the table once, not per inserted row as
        # the original did inside the loop.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS ParentText
            (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                content TEXT,
                parent_id INTEGER,
                url TEXT,
                FOREIGN KEY (parent_id) REFERENCES Website(id)
            );
        ''')
        for text_parent_splitted in texts_parent_splitted:
            cursor.execute('''
                INSERT INTO ParentText (content, parent_id, url)
                VALUES (?, ?, ?);
            ''', (
                text_parent_splitted["content"].page_content,
                text_parent_splitted["parent_id"],
                text_parent_splitted["url"]
            ))
        # BUG FIX: without an explicit commit the inserted parent texts
        # were discarded when the connection was dropped at process exit.
        conn.commit()
        stop_loading_animation(done_split, loading_split)
        print("✅ Texto pai salvo no sqlite.")

        count_embeddings = 0
        total_child_splitted = len(texts_child_splitted)
        for text_child_splitted in texts_child_splitted:
            # Fresh flag per chunk so each spinner can be stopped alone.
            done_embeding = [False]
            loading_embedding = start_loading_animation(
                done_embeding,
                f"""Gerando embeddings e salvando no qdrant: {count_embeddings} de {total_child_splitted}"""  # noqa
            )
            embedding = oa_client.embeddings.create(
                input=[text_child_splitted["content"].page_content],
                model=os.environ.get("OPENAI_MODEL_EMBEDDING")
            ).data[0].embedding
            qdrant_client.upsert(
                collection_name=os.environ.get("COLLECTION_NAME"),
                points=[PointStruct(
                    id=str(text_child_splitted["id"]),
                    vector=embedding,
                    payload={
                        "content": text_child_splitted["content"].page_content,
                        "parent_id": text_child_splitted["parent_id"],
                        "type": "text"
                    }
                )]
            )
            count_embeddings += 1
            stop_loading_animation(done_embeding, loading_embedding)
        print("✅ Texto filho salvo no qdrant.")

        print("✅ RAG finalizado.")

        print(
            f"""
            📊 Relatório:\n
            \t Tempo de execução: {print_execution_time(start_time)}\n
            \t Textos Filhos e Embedding gerados: {len(texts_child_splitted)}\n
            \t Textos pai gerados: {len(texts_parent_splitted)}\n
            \t WebSites recuperados da base: {len(web_sites)}\n
            \t Erros ao gerar descrição de imagens: {len(error_images_description)}\n
            """)  # noqa

    except Exception as error:
        print(f"❌ Erro: {error}")
    finally:
        # Make sure no spinner thread outlives the run.
        safe_join(loading_split, done_split)
        safe_join(loading_image_description, done_image_description)
        safe_join(loading_embedding, done_embeding)
        # BUG FIX: release the SQLite connection on every exit path.
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    run()
agente/pipelines/splitter/splitter.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_text_splitters import \
|
2 |
+
MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
3 |
+
import uuid
|
4 |
+
|
5 |
+
|
def split(texts, headers_to_split_on):
    """Split texts with the "Parent Document Retriever" scheme.

    Each input text is first split into markdown sections (the parents);
    each section is then split into small overlapping chunks (the
    children).  Every child keeps the UUID of its parent section in
    ``parent_id``.  Returns ``[parents, children]``.
    """
    parents = []
    children = []

    for text in texts:
        sections = markdown_split(text["text_content"], headers_to_split_on)

        for section in sections:
            section_id = uuid.uuid4()
            parents.append({
                "id": section_id,
                "parent_id": text["text_id"],
                "content": section,
                "url": text["url"],
            })

            for chunk in text_split(section.page_content):
                children.append({
                    "id": uuid.uuid4(),
                    "parent_id": section_id,
                    "content": chunk,
                })

    return [parents, children]
43 |
+
|
44 |
+
|
def markdown_split(text, headers_to_split_on):
    """Split markdown *text* into one document per header section."""
    return MarkdownHeaderTextSplitter(headers_to_split_on).split_text(text)
49 |
+
|
50 |
+
|
def text_split(text):
    """Split *text* into ~500-character chunks with 100-char overlap."""
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    return chunker.create_documents([text])
agente/pipelines/utils/cli.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import timedelta
|
2 |
+
import itertools
|
3 |
+
import sys
|
4 |
+
import threading
|
5 |
+
import time
|
6 |
+
|
7 |
+
|
def loading_animation(done, text):
    """Spin a CLI loading indicator until *done()* returns True.

    Redraws ``<text> <spinner>`` on the same line ten times a second,
    then clears back to the start of the line on exit.
    """
    spinner = itertools.cycle(['|', '/', '-', '\\'])
    while not done():
        sys.stdout.write(f'\r{text} ' + next(spinner))
        sys.stdout.flush()
        time.sleep(0.1)
    sys.stdout.write('\r')
    sys.stdout.flush()
17 |
+
|
18 |
+
|
def start_loading_animation(done, text):
    """Start loading_animation() on a background thread.

    *done* is a one-element list used as a mutable flag: setting
    ``done[0] = True`` (see stop_loading_animation) ends the spinner.
    Returns the started thread so the caller can join it later.
    """
    spinner_thread = threading.Thread(
        target=loading_animation,
        args=(lambda: done[0], text),
    )
    spinner_thread.start()
    return spinner_thread
24 |
+
|
25 |
+
|
def stop_loading_animation(done, thread):
    """Stop a spinner started by start_loading_animation().

    Flips the shared one-element *done* flag (watched by the animation
    loop) and blocks until *thread* finishes.
    """
    done[0] = True
    thread.join()
29 |
+
|
30 |
+
|
def print_execution_time(start_time):
    """Return the time elapsed since *start_time* as ``HH:MM:SS:mmm``.

    *start_time* is a ``time.time()`` timestamp.  Despite the name the
    function returns the formatted string rather than printing it.
    """
    elapsed = timedelta(seconds=time.time() - start_time)

    total_seconds = elapsed.total_seconds()
    hours, rest = divmod(total_seconds, 3600)
    minutes, rest = divmod(rest, 60)
    seconds, fraction = divmod(rest, 1)
    millis = fraction * 1000

    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}:{int(millis):03}"  # noqa
agente/pipelines/utils/markdown.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from oac.oac import get_description_of_image
|
3 |
+
|
4 |
+
|
def generate_image_description_with_empty_description(oa_client, websites):
    """Fill in empty markdown image alt-texts with model descriptions.

    For every website, each ``![](<link>)`` occurrence is rewritten to
    ``![<description>](<link>)`` using get_description_of_image().
    Returns ``(texts, errors)``: *texts* mirrors *websites* (failed
    entries keep their original content) and *errors* lists the entries
    whose substitution raised.
    """
    # Regex for a markdown image with an empty description: ![](<link>)
    empty_image = r'!\[\]\(([^)]+)\)'

    def describe(match):
        link = match.group(1)
        description = get_description_of_image(oa_client, link)
        return f'![{description}]({link})'

    described = []
    failures = []
    for site in websites:
        content = site["text_content"]
        try:
            content = re.sub(empty_image, describe, content)
        except Exception:
            # Keep the original content; record the failure for the report.
            failures.append(
                f"{site['text_id']}: {site['text_content']}")
        described.append({
            "id": site["text_id"],
            "content": content,
            "url": site["url"]
        })

    return described, failures
agente/prompts/system.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You are an assistant designed to work as a FAQ for Brazilian Government Services. Be concise in your answers.
|
2 |
+
|
3 |
+
Your natural language is Brazilian Portuguese.
|
4 |
+
|
5 |
+
You will receive two labels in the user's question, one called 'Context' and the other 'Question'. Inside the 'Context', you will receive a JSON list in the following format, enclosed between three backticks.
|
6 |
+
|
7 |
+
```
|
8 |
+
[
|
9 |
+
{
|
10 |
+
content: here you will receive the content in markdown format of the context that you will use to answer the user's question,
|
11 |
+
url: here is the URL from which this content was taken, and you should reference it in your answer
|
12 |
+
}
|
13 |
+
]
|
14 |
+
```
|
15 |
+
|
16 |
+
Since you will receive a list with these contexts, you will answer the question inside the 'Question' label based on them.
|
17 |
+
|
18 |
+
The format of your response should be in markdown. You can include images in your response by interpreting them based on the description provided for each image and responding in markdown format.
|
19 |
+
|
20 |
+
If you do not find the answer in the given context, simply respond to the user with "Não encontrei sua resposta."
|
21 |
+
|
22 |
+
Do not use information that is not in the given context.
|
agente/prompts/tamplates/.gitkeep
ADDED
File without changes
|
agente/tools/.gitkeep
ADDED
File without changes
|
agente/tools/__init__.py
ADDED
File without changes
|
agente/tools/python/.gitkeep
ADDED
File without changes
|
agente/utils/.gitkeep
ADDED
File without changes
|
agente/utils/file.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
def File(file_path):
    """Return the full text of *file_path*, decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()
edgedb/dbschema/initial.esdl
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
module default {
    # An image scraped from a page; `alt` is the accessibility text.
    type Image {
        name: str;
        path: str;
        url: str;
        hyperlink: str;
        alt: str;
    }

    # A video reference scraped from a page.
    type Video {
        name: str;
        url: str;
        hyperlink: str;
        alt: str;
    }

    # Raw page text content.
    type Text {
        content: str;
    }

    # A crawled page with its media links and pre-segmented text chunks.
    type Website {
        url: str;
        relative_path: str;
        hyperrefs: array<str>;
        multi images: Image;
        multi videos: Video;
        text: Text;
        segmented_texts: array<str>;
    }

    # Parent text chunk used by the RAG parent-document lookup;
    # `parent_id` links back to the originating object's uuid.
    type Pattern {
        content: str;
        parent_id: uuid;
        url: str;
    }
}
edgedb/dbschema/migrations/00001-m14hvdp.edgeql
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
CREATE MIGRATION m14hvdpwu2t2cxnf75otbhvkltgiwr5hqx7fh74t7rzysjw6fy34oa
|
2 |
+
ONTO initial
|
3 |
+
{
|
4 |
+
CREATE TYPE default::Image {
|
5 |
+
CREATE PROPERTY alt: std::str;
|
6 |
+
CREATE PROPERTY hyperlink: std::str;
|
7 |
+
CREATE PROPERTY name: std::str;
|
8 |
+
CREATE PROPERTY path: std::str;
|
9 |
+
CREATE PROPERTY url: std::str;
|
10 |
+
};
|
11 |
+
CREATE TYPE default::Text {
|
12 |
+
CREATE PROPERTY content: std::str;
|
13 |
+
};
|
14 |
+
CREATE TYPE default::Video {
|
15 |
+
CREATE PROPERTY alt: std::str;
|
16 |
+
CREATE PROPERTY hyperlink: std::str;
|
17 |
+
CREATE PROPERTY name: std::str;
|
18 |
+
CREATE PROPERTY url: std::str;
|
19 |
+
};
|
20 |
+
CREATE TYPE default::Website {
|
21 |
+
CREATE MULTI LINK images: default::Image;
|
22 |
+
CREATE LINK text: default::Text;
|
23 |
+
CREATE MULTI LINK videos: default::Video;
|
24 |
+
CREATE PROPERTY hyperrefs: array<std::str>;
|
25 |
+
CREATE PROPERTY relative_path: std::str;
|
26 |
+
CREATE PROPERTY segmented_texts: array<std::str>;
|
27 |
+
CREATE PROPERTY url: std::str;
|
28 |
+
};
|
29 |
+
};
|
edgedb/dbschema/migrations/00002-m1dsfjl.edgeql
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
CREATE MIGRATION m1dsfjlp6m7m6zym3olrmjirgmulk4m4vk2er7x22ohmbskv2kgbxa
|
2 |
+
ONTO m14hvdpwu2t2cxnf75otbhvkltgiwr5hqx7fh74t7rzysjw6fy34oa
|
3 |
+
{
|
4 |
+
CREATE TYPE default::Pattern {
|
5 |
+
CREATE PROPERTY content: std::str;
|
6 |
+
CREATE PROPERTY parent_id: std::uuid;
|
7 |
+
CREATE PROPERTY url: std::str;
|
8 |
+
};
|
9 |
+
};
|
edgedb/depopulate_edgedb.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Delete every scraped object (Website, Image, Video, Text) from EdgeDB.

Interactive safety prompt: the deletion only runs after an explicit 'y'.
Connection parameters come from the EDGEDB_* environment variables loaded
via dotenv (see .env-example).
"""
import edgedb
from dotenv import load_dotenv

load_dotenv()

client = edgedb.create_client()

resp = input('Are you sure you want to depopulate the database? (y/n) ')

if resp.lower() == 'y':
    # BUG FIX: `client.query()` accepts a single EdgeQL statement only and
    # raises on multi-statement scripts; `client.execute()` is the API for
    # running an EdgeQL script. Website is deleted first so its links no
    # longer restrict deleting the Image/Video/Text objects it points to.
    client.execute('''
        DELETE Website;
        DELETE Image;
        DELETE Video;
        DELETE Text;
    ''')
    print('Database depopulated.')
else:
    print('Operation canceled.')

# Release the connection pool explicitly instead of relying on GC.
client.close()
|
edgedb/populate_edgedb.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Populate EdgeDB with the scraped pages stored as JSON files.

Walks the download tree, and for each *.json record inserts one Website
(with a nested Text) and then attaches its Image and Video objects.
Connection parameters come from the EDGEDB_* environment variables loaded
via dotenv (see .env-example).
"""
import json
import os

import edgedb
from dotenv import load_dotenv

load_dotenv()

client = edgedb.create_client()

DATA_DIR = "../Banco_de_Dados/Coleta/downloaded_files"


def _relative_path(root: str) -> str:
    """Rebuild the record's path relative to the download tree.

    NOTE(review): dropping the first 5 path components assumes the script is
    always launched from the same working directory and uses '/' separators
    (breaks on Windows) — confirm before reusing elsewhere.
    """
    return './' + '/'.join(root.split('/')[5:])


def _insert_website(data: dict, data_path: str) -> None:
    """Insert one Website row (with its nested Text) for a JSON record."""
    client.query(
        '''
        INSERT Website {
            url := <str>$url,
            relative_path := <str>$relative_path,
            hyperrefs := <array<str>>$gov_links,
            images := {},
            videos := {},
            text := (
                INSERT Text {
                    content := <str>$content
                }
            )
        };
        ''',
        url=data['absolute_url'],
        relative_path=data_path,
        gov_links=data['gov_links'],
        content=data['text'],
    )


def _insert_images(data: dict) -> None:
    """Attach each image of the record to its Website (matched by url)."""
    for image in data.get('images', []):
        client.query(
            '''
            UPDATE Website
            FILTER .url = <str>$url
            SET {
                images += {
                    (INSERT Image {
                        name := <str>$name,
                        path := <str>$path,
                        url := <str>$image_url,
                        hyperlink := <str>$hyperlink,
                        alt := <str>$alt,
                    })
                }
            };
            ''',
            url=data['absolute_url'],
            path=image['path'],
            name=image['name'],
            image_url=image['url'],
            hyperlink=image['hyperlink'],
            alt=image['alt'],
        )


def _insert_videos(data: dict) -> None:
    """Attach each video of the record to its Website (matched by url)."""
    for video in data.get('videos', []):
        client.query(
            '''
            UPDATE Website
            FILTER .url = <str>$url
            SET {
                videos += {
                    (INSERT Video {
                        name := <str>$name,
                        url := <str>$video_url,
                        hyperlink := <str>$hyperlink,
                        alt := <str>$alt,
                    })
                }
            };
            ''',
            url=data['absolute_url'],
            name=video['name'],
            video_url=video['url'],
            hyperlink=video['hyperlink'],
            alt=video['alt'],
        )


for root, _dirs, files in os.walk(DATA_DIR):
    for file in files:
        if not file.endswith(".json"):
            continue
        # Close the file before the network round-trips; be explicit about
        # the encoding instead of depending on the platform default.
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        data_path = _relative_path(root)
        _insert_website(data, data_path)
        _insert_images(data)
        _insert_videos(data)

# Release the connection pool explicitly instead of relying on GC.
client.close()
|