Spaces:

LordFarquaad42
/

Groove-GPT

Sleeping

App Files Files Community

Groove-GPT / add_data.py

LordFarquaad42

fixed bug where create_client loaded client

c796de9 7 months ago

raw

history blame

3.24 kB

	import chromadb
	from chromadb.utils import embedding_functions


	def create_client():
	    """Create the "schemer2" collection in the on-disk Chroma store and return it.

	    Opens a persistent Chroma client rooted at ``./chromadb_linux/``, builds a
	    SentenceTransformer embedding function for the mxbai model, and creates a
	    new collection that uses it.

	    NOTE(review): get_client() looks up a collection named "scheme" while this
	    function creates "schemer2" -- confirm the two names are meant to differ.

	    Returns:
	        The collection object returned by ``client.create_collection``.
	    """
	    db = chromadb.PersistentClient(path="./chromadb_linux/")

	    # Embedding model weights are roughly 0.5 GB.
	    model_name: str = "mixedbread-ai/mxbai-embed-large-v1"
	    collection_name: str = "schemer2"
	    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
	        model_name=model_name
	    )

	    return db.create_collection(
	        name=collection_name,
	        embedding_function=embedder,
	    )

	def get_client():
	    """Open the existing "scheme" collection from the on-disk Chroma store.

	    Opens a persistent Chroma client rooted at ``./chromadb_linux/``, builds a
	    SentenceTransformer embedding function for the mxbai model, and fetches
	    the collection with that embedding function attached.

	    NOTE(review): create_client() creates a collection named "schemer2" while
	    this function looks up "scheme" -- confirm the two names are meant to
	    differ.

	    Returns:
	        The collection object returned by ``client.get_collection``.
	    """
	    db = chromadb.PersistentClient(path="./chromadb_linux/")

	    # Embedding model weights are roughly 0.5 GB.
	    model_name: str = "mixedbread-ai/mxbai-embed-large-v1"
	    collection_name: str = "scheme"
	    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
	        model_name=model_name
	    )

	    return db.get_collection(
	        name=collection_name,
	        embedding_function=embedder,
	    )


	def update_collection(iter: int, text: object, client: chromadb.Collection):
	    """Add one transcribed page to the collection.

	    Args:
	        iter: Index appended to the document id (the caller passes the page
	            index from ``enumerate``).
	        text: Mapping with a ``"text"`` entry (the document body) and a
	            ``"content"`` entry (used as the id prefix).
	        client: Collection that receives the document via ``add``.
	    """
	    document = text["text"]
	    # The id is the content description followed by the index.
	    document_id = text["content"] + str(iter)
	    client.add(
	        documents=[document],
	        metadatas=[{"source": "pdf"}],
	        ids=[document_id],
	    )


	def encode_image(image) -> str:
	    """Serialize an image as JPEG and return it as a base64 string.

	    Args:
	        image: Object exposing ``save(fp, format=...)``; it is written to an
	            in-memory buffer with ``format="JPEG"``.

	    Returns:
	        The buffer contents, base64-encoded and decoded as UTF-8 text.
	    """
	    from base64 import b64encode
	    from io import BytesIO

	    buffer = BytesIO()
	    image.save(buffer, format="JPEG")
	    return b64encode(buffer.getvalue()).decode("utf-8")


	async def image_to_text(image) -> object:
	    """Ask the OpenAI chat API to transcribe a base64-encoded JPEG.

	    The request uses JSON response mode and instructs the model to return an
	    object with two keys, ``'content'`` (a one-line description) and ``'text'``
	    (the transcription).

	    Note: the OpenAI client used here is synchronous, so this coroutine blocks
	    the event loop for the duration of the request.

	    Args:
	        image: Base64 text of a JPEG image (as produced by ``encode_image``).

	    Returns:
	        The first choice's message content parsed with ``json.loads``.
	    """
	    from openai import OpenAI
	    import json

	    client = OpenAI()

	    prompt = "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"

	    # A data URL is "data:<mediatype>;base64,<data>"; the original had a stray
	    # ";" before the comma, which made the URL malformed.
	    data_url = f"data:image/jpeg;base64,{image}"

	    response = client.chat.completions.create(
	        model="gpt-4-turbo",
	        response_format={"type": "json_object"},
	        messages=[
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": prompt},
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": data_url,
	                            "detail": "high",
	                        },
	                    },
	                ],
	            }
	        ],
	    )
	    return json.loads(response.choices[0].message.content)


	async def start_troggin_off(dir: str, client):
	    """Recursively ingest every ``.pdf`` under ``dir`` into the collection.

	    Each entry of ``dir`` is examined once: sub-directories are walked
	    recursively, and paths ending in ``.pdf`` are converted to images with
	    ``pdf2image.convert_from_path``. Every image is base64-encoded,
	    transcribed with ``image_to_text`` and stored with ``update_collection``,
	    using the image's index within its PDF as the id suffix.

	    Args:
	        dir: Directory to scan.
	        client: Collection passed through to ``update_collection``.
	    """
	    import os
	    from pdf2image import convert_from_path

	    for entry in os.listdir(dir):
	        entry_path = os.path.join(dir, entry)

	        if os.path.isdir(entry_path):
	            await start_troggin_off(entry_path, client)  # recursive call
	        elif entry_path.endswith(".pdf"):
	            # `elif`: a directory whose name happens to end in ".pdf" must not
	            # also be handed to convert_from_path after being walked.
	            images = convert_from_path(entry_path)

	            for i, image in enumerate(images):
	                encoded_image = encode_image(image)
	                text = await image_to_text(encoded_image)
	                update_collection(i, text, client)

	if __name__ == "__main__":
	    # Script entry point: create the collection, then ingest every PDF found
	    # under "data/".
	    import asyncio

	    client = create_client()
	    asyncio.run(
	        start_troggin_off("data/", client)
	    )