|
"""Load html from files, clean up, split, ingest into Weaviate.""" |
|
import os |
|
from pathlib import Path |
|
|
|
import weaviate |
|
from bs4 import BeautifulSoup |
|
from langchain.text_splitter import CharacterTextSplitter |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the concatenated paper text; sections are delimited by the "§" marker,
# so the file is assumed to be UTF-8 (explicit encoding for portability).
with open('paper-dir/main.txt', encoding="utf-8") as f:
    paper_text = f.read()

# One document per section.
docs = paper_text.split("§")

# The first whitespace-delimited token of each section is used as its source
# identifier. BUG FIX: create_documents expects a list of metadata *dicts*
# (one per input text), not bare strings — the ingest loop below reads
# text.metadata["source"], which only works if each entry is {"source": ...}.
metadatas = [{"source": doc.split(" ")[0]} for doc in docs]

# Chunk each section into ~1000-character pieces with 200-char overlap,
# splitting on newlines; metadata is carried onto every chunk.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

documents = text_splitter.create_documents(docs, metadatas=metadatas)
|
|
|
|
|
# Connect to Weaviate; raises KeyError immediately if either env var is unset,
# which is the desired fail-fast behavior for a one-shot ingest script.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

# Recreate the "Paragraph" class from scratch. BUG FIX: delete_class raises an
# UnexpectedStatusCodeException when the class does not exist yet (e.g. on a
# fresh instance), which made the script non-idempotent; ignore that case.
# (Also removed a stray client.schema.get() whose result was discarded.)
try:
    client.schema.delete_class("Paragraph")
except weaviate.exceptions.UnexpectedStatusCodeException:
    pass  # class not present yet — nothing to delete
|
def _openai_text_property(name, description, *, skip):
    """Build a text property for the text2vec-openai vectorizer.

    ``skip=True`` keeps the property out of the vector embedding while still
    storing it; property names are never vectorized either way.
    """
    return {
        "dataType": ["text"],
        "description": description,
        "moduleConfig": {
            "text2vec-openai": {
                "skip": skip,
                "vectorizePropertyName": False,
            }
        },
        "name": name,
    }


# Single-class schema: each object is one text chunk ("Paragraph"), embedded
# with OpenAI's ada-002 text model. Only "content" contributes to the vector;
# "source" is stored as plain metadata.
schema = {
    "classes": [
        {
            "class": "Paragraph",
            "description": "A written paragraph",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                _openai_text_property(
                    "content", "The content of the paragraph", skip=False
                ),
                _openai_text_property("source", "The link", skip=True),
            ],
        },
    ]
}
|
|
|
# Register the class definition with the server.
client.schema.create(schema)

# Ingest every chunk as a "Paragraph" object; the batch context manager
# flushes pending objects automatically on exit.
with client.batch as batch:
    for doc in documents:
        properties = {
            "content": doc.page_content,
            "source": str(doc.metadata["source"]),
        }
        batch.add_data_object(properties, "Paragraph")
|
|