# paperchat/ingest.py
"""Load html from files, clean up, split, ingest into Weaviate."""
import os

import weaviate
from langchain.text_splitter import CharacterTextSplitter

# Only needed by the commented-out HTML ingestion below:
# from pathlib import Path
# from bs4 import BeautifulSoup

# Earlier HTML-based ingestion (crawling the LangChain docs), left commented
# out for reference:
# def clean_data(data):
#     soup = BeautifulSoup(data)
#     text = soup.find_all("main", {"id": "main-content"})[0].get_text()
#     return "\n".join([t for t in text.split("\n") if t])
#
# docs = []
# metadatas = []
# for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
#     if p.is_dir():
#         continue
#     with open(p) as f:
#         docs.append(clean_data(f.read()))
#         metadatas.append({"source": p})
with open("paper-dir/main.txt") as f:
    paper_text = f.read()
docs = paper_text.split("§")
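# The split above assumes main.txt delimits sections with the section sign,
# roughly like this (illustrative layout, not taken from the actual file):
#   § Introduction <section text...>
#   § Methods <section text...>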
# Each section's "source" label is the first word after the section symbol.
# create_documents expects per-document metadata dicts, and the Weaviate import
# below reads metadata["source"], so wrap the label in a dict.
metadatas = [{"source": doc.strip().split(" ")[0]} for doc in docs]
# Split each section into ~1000-character chunks with 200 characters of
# overlap (length is measured with len, i.e. in characters).
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
documents = text_splitter.create_documents(docs, metadatas=metadatas)
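# Sanity check before touching Weaviate: report how many chunks were produced.
print(f"Prepared {len(documents)} chunks from {len(docs)} sections")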
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)
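# This assumes the v3 weaviate Python client (weaviate.Client). The OpenAI key
# is sent as a request header so the server-side text2vec-openai module can
# create the embeddings during import.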
# Drop any existing "Paragraph" class so it is recreated from scratch; the
# schema.get() call's result is not used.
client.schema.delete_class("Paragraph")
client.schema.get()
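# The "Paragraph" class below stores each chunk's text in "content"
# (vectorized by text2vec-openai with the ada-002 model) and its section label
# in "source" (skipped by the vectorizer).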
schema = {
    "classes": [
        {
            "class": "Paragraph",
            "description": "A written paragraph",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The source section label",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "source",
                },
            ],
        },
    ]
}
client.schema.create(schema)
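# Optional tuning (a sketch, assuming weaviate-client v3.x): batch size and
# dynamic batching can be configured before the import, e.g.
# client.batch.configure(batch_size=100, dynamic=True)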
with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            {
                "content": text.page_content,
                "source": str(text.metadata["source"]),
            },
            "Paragraph",
        )
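
# A minimal retrieval sketch to verify the import (not part of ingestion;
# "paper structure" is just a placeholder query):
# result = (
#     client.query.get("Paragraph", ["content", "source"])
#     .with_near_text({"concepts": ["paper structure"]})
#     .with_limit(3)
#     .do()
# )
# print(result)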