Spaces:
Sleeping
Sleeping
import chromadb | |
from chromadb.utils import embedding_functions | |
def create_client():
    """Open the on-disk Chroma store and return the ``schemer2`` collection.

    The collection is configured with a SentenceTransformer embedding
    function backed by the ``mixedbread-ai/mxbai-embed-large-v1`` model.

    Returns:
        The Chroma collection named ``schemer2``; it is created when missing
        and reused when it already exists in ``./chromadb_linux/``.
    """
    model_name = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    collection_name = "schemer2"

    client = chromadb.PersistentClient(path="./chromadb_linux/")
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_name
    )
    # The store persists between runs, so a plain create_collection() raises
    # on every run after the first because the collection already exists.
    return client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
    )
def get_client():
    """Return the existing ``scheme`` collection from the on-disk Chroma store.

    The collection is looked up (not created) in ``./chromadb_linux/`` and is
    bound to a SentenceTransformer embedding function that uses the
    ``mixedbread-ai/mxbai-embed-large-v1`` model.
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1",  # ~ 0.5 gb
    )
    # NOTE(review): create_client() uses the collection name "schemer2";
    # confirm that "scheme" is really the collection meant to be read here.
    return store.get_collection(name="scheme", embedding_function=embedder)
def update_collection(iter: int, text: object, client: chromadb.Collection):
    """Add one transcribed page to the collection.

    Args:
        iter: Index appended to the document id.
        text: Mapping with a ``"text"`` entry (stored as the document) and a
            ``"content"`` entry (used as the id prefix).
        client: Collection that receives the document.
    """
    document = text["text"]
    page_metadata = {"source": "pdf"}
    # The id is the content description followed by the index.
    document_id = text["content"] + str(iter)
    client.add(
        documents=[document],
        metadatas=[page_metadata],
        ids=[document_id],
    )
def encode_image(image) -> str:
    """Serialize *image* as JPEG and return the bytes as base64 text.

    Args:
        image: Object exposing ``save(file_obj, format=...)``; presumably a
            PIL image -- verify against callers.

    Returns:
        The base64 encoding of the saved JPEG bytes, as a ``str``.
    """
    import base64
    import io

    # Let the image write itself into an in-memory buffer instead of a file.
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")

    jpeg_bytes = buffer.getvalue()
    base64_bytes = base64.b64encode(jpeg_bytes)
    return base64_bytes.decode("utf-8")
async def image_to_text(image) -> object:
    """Ask the OpenAI chat model to transcribe a base64-encoded JPEG.

    Args:
        image: Base64 text of a JPEG image; it is embedded in a ``data:`` URL
            and sent as the image part of the request.

    Returns:
        The model reply parsed with ``json.loads``. The prompt asks for an
        object with the keys ``'content'`` and ``'text'``.
    """
    from openai import OpenAI
    import json

    prompt = "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"

    # A data URL has the form "data:<mediatype>;base64,<data>". There must be
    # no ';' between "base64" and the comma, otherwise the URL is malformed.
    image_url = f"data:image/jpeg;base64,{image}"

    client = OpenAI()
    # NOTE: this is the synchronous client, so the request blocks the event
    # loop until the reply arrives even though this function is a coroutine.
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )
    return json.loads(response.choices[0].message.content)
async def start_troggin_off(dir: str, client):
    """Recursively transcribe every PDF under *dir* into the collection.

    Entries of *dir* are visited in ``os.listdir`` order. Sub-directories are
    processed by a recursive call. Files whose path ends in ``.pdf`` are
    converted to images with ``convert_from_path``; each image is encoded,
    transcribed with ``image_to_text`` and stored with ``update_collection``.

    Args:
        dir: Directory to scan.
        client: Collection handed through to ``update_collection``.
    """
    import os
    from pdf2image import convert_from_path

    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)

        if os.path.isdir(full_path):
            await start_troggin_off(full_path, client)  # recursive call
            # A directory is never a PDF, even if its name ends in ".pdf",
            # so it must not fall through to the conversion below.
            continue

        if not full_path.endswith(".pdf"):
            continue

        # The index restarts at 0 for every PDF and becomes part of the
        # document id built by update_collection.
        for page_index, page_image in enumerate(convert_from_path(full_path)):
            encoded_image = encode_image(page_image)
            text = await image_to_text(encoded_image)
            update_collection(page_index, text, client)
if __name__ == "__main__":
    import asyncio

    # Build the collection first, then ingest everything found under data/.
    collection = create_client()
    asyncio.run(start_troggin_off("data/", collection))