Pecximenes's picture
Removing directory and organizing files
94e320d
raw
history blame
6.48 kB
import os
import sqlite3
import time
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
from utils.markdown import generate_image_description_with_empty_description
from utils.cli import \
print_execution_time, start_loading_animation, stop_loading_animation
from splitter import splitter
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct
# Markdown heading markers ("#" through "######") paired with the label
# "Header <level>" for that level. run() hands this list to splitter.split().
HEADERS_TO_SPLIT_ON = [
    ("#" * level, f"Header {level}") for level in range(1, 7)
]
def safe_join(thread, done: List[bool]):
    """Flag a still-running thread as done and block until it exits.

    Args:
        thread: Thread-like object exposing ``is_alive()`` and ``join()``,
            or ``None`` when no thread was ever started.
        done: Single-element flag list. ``done[0]`` is set to ``True`` before
            joining (presumably the thread polls this flag to know when to
            stop -- verify against ``start_loading_animation``).

    Nothing happens when ``thread`` is ``None`` or has already finished; in
    that case ``done`` is left untouched.
    """
    if thread is None or not thread.is_alive():
        return

    done[0] = True
    thread.join()
def _fetch_web_sites(cursor):
    """Return one ``{"text_content", "text_id", "url"}`` dict per website.

    The query LEFT JOINs Image and Video, so a website that owns several
    images and/or videos comes back as several rows (one per combination).
    Only the Website columns are read here, so repeated rows are collapsed by
    website id; otherwise the same page would be described, split and
    embedded once per joined row.
    """
    cursor.execute('''
        SELECT w.id, w.url, w.text_content, w.relative_path, w.hyperrefs,
        i.name AS image_name, i.path AS image_path, i.url AS image_url, i.hyperlink AS image_hyperlink, i.alt AS image_alt,
        v.name AS video_name, v.url AS video_url, v.hyperlink AS video_hyperlink, v.alt AS video_alt
        FROM Website w
        LEFT JOIN Image i ON i.website_id = w.id
        LEFT JOIN Video v ON v.website_id = w.id
    ''')  # noqa

    web_sites = []
    seen_ids = set()
    for row in cursor.fetchall():
        text_id = row[0]
        if text_id in seen_ids:
            continue
        seen_ids.add(text_id)
        web_sites.append({
            "text_content": row[2],
            "text_id": text_id,
            "url": row[1],
        })
    return web_sites


def _save_parent_texts(cursor, texts_parent_splitted):
    """Create the ParentText table if missing and insert every parent chunk.

    The caller is responsible for committing the transaction.
    """
    # Created once up front instead of once per inserted row.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS ParentText
        (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        content TEXT,
        parent_id INTEGER,
        url TEXT,
        FOREIGN KEY (parent_id) REFERENCES Website(id)
        );
    ''')
    cursor.executemany('''
        INSERT INTO ParentText (content, parent_id, url)
        VALUES (?, ?, ?);
    ''', [
        (
            parent["content"].page_content,
            parent["parent_id"],
            parent["url"],
        )
        for parent in texts_parent_splitted
    ])


def run():
    """Build the RAG index from the websites stored in SQLite.

    Steps, in order:
      1. Load environment variables and connect to SQLite, Qdrant and OpenAI.
      2. Recreate the Qdrant collection named by ``COLLECTION_NAME``.
      3. Read the websites and generate missing image descriptions.
      4. Split the texts into parent and child chunks.
      5. Store parent chunks in the SQLite ``ParentText`` table (committed).
      6. Embed each child chunk with ``OPENAI_MODEL_EMBEDDING`` and upsert it
         into Qdrant.
      7. Print a summary report.

    Any exception is caught and printed; loading-animation threads are always
    joined and the SQLite connection is always closed.
    """
    loading_split = None
    loading_image_description = None
    loading_embedding = None
    done_split = [False]
    done_image_description = [False]
    done_embeding = [False]
    conn = None

    try:
        load_dotenv()
        start_time = time.time()
        print("➗ Começando o RAG...")

        # Loop-invariant configuration, read once after load_dotenv().
        collection_name = os.environ.get("COLLECTION_NAME")
        embedding_model = os.environ.get("OPENAI_MODEL_EMBEDDING")

        conn = sqlite3.connect('../meu_banco.db')
        cursor = conn.cursor()
        print("✅ Conectado ao sqlite.")

        # NOTE(review): ":memory:" looks like qdrant-client's local in-memory
        # mode, in which case the collection is lost when the process exits.
        # Confirm this is intended.
        qdrant_client = QdrantClient(":memory:")
        print("✅ Conectado ao qdrant.")

        oa_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        print("✅ Conectado a OpenAI.")

        # Always start from an empty collection.
        if qdrant_client.collection_exists(collection_name):
            qdrant_client.delete_collection(
                collection_name=collection_name
            )
        # NOTE(review): size=1536 presumably has to match the output
        # dimension of the configured embedding model -- confirm.
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
        )
        print("✅ Collection criada.")

        web_sites = _fetch_web_sites(cursor)

        loading_image_description = start_loading_animation(
            done_image_description,
            "Gerando Descrições de imagens caso não tenha..."
        )
        texts_with_images_descriptions, error_images_description = \
            generate_image_description_with_empty_description(
                oa_client,
                web_sites
            )
        stop_loading_animation(
            done_image_description,
            loading_image_description
        )
        print("✅ Descrição de imagens geradas.")

        texts_parent_splitted, texts_child_splitted = splitter.split(
            texts_with_images_descriptions,
            HEADERS_TO_SPLIT_ON
        )
        print("✅ Textos dividos.")

        # NOTE(review): this animation says "Dividindo textos..." but runs
        # while the parent texts are being saved; ordering kept as-is.
        loading_split = start_loading_animation(
            done_split,
            "Dividindo textos..."
        )
        _save_parent_texts(cursor, texts_parent_splitted)
        # Without a commit the inserts are rolled back when the connection
        # is closed.
        conn.commit()
        stop_loading_animation(done_split, loading_split)
        print("✅ Texto pai salvo no sqlite.")

        total_child_splitted = len(texts_child_splitted)
        # count_embeddings is the number of chunks already processed.
        for count_embeddings, text_child_splitted in enumerate(
            texts_child_splitted
        ):
            # Fresh flag per iteration: each chunk gets its own animation.
            done_embeding = [False]
            loading_embedding = start_loading_animation(
                done_embeding,
                f"""Gerando embeddings e salvando no qdrant: {count_embeddings} de {total_child_splitted}"""  # noqa
            )
            child_content = text_child_splitted["content"].page_content
            embedding = oa_client.embeddings.create(
                input=[child_content],
                model=embedding_model
            ).data[0].embedding
            qdrant_client.upsert(
                collection_name=collection_name,
                points=[PointStruct(
                    id=str(text_child_splitted["id"]),
                    vector=embedding,
                    payload={
                        "content": child_content,
                        "parent_id": text_child_splitted["parent_id"],
                        "type": "text"
                    }
                )]
            )
            stop_loading_animation(done_embeding, loading_embedding)

        print("✅ Texto filho salvo no qdrant.")
        print("✅ RAG finalizado.")
        print(
            f"""
            📊 Relatório:\n
            \t Tempo de execução: {print_execution_time(start_time)}\n
            \t Textos Filhos e Embedding gerados: {len(texts_child_splitted)}\n
            \t Textos pai gerados: {len(texts_parent_splitted)}\n
            \t WebSites recuperados da base: {len(web_sites)}\n
            \t Erros ao gerar descrição de imagens: {len(error_images_description)}\n
            """)  # noqa
    except Exception as error:
        print(f"❌ Erro: {error}")
    finally:
        # Stop any animation thread still running after a failure.
        safe_join(loading_split, done_split)
        safe_join(loading_image_description, done_image_description)
        safe_join(loading_embedding, done_embeding)
        if conn is not None:
            conn.close()
if __name__ == "__main__":
run()