Spaces:
Sleeping
Sleeping
LordFarquaad42
committed on
Commit
•
1612f56
1
Parent(s):
a25aac3
hacker mode
Browse files- add_data.py +35 -21
- app.py +10 -2
add_data.py
CHANGED
@@ -2,6 +2,19 @@ import chromadb
|
|
2 |
from chromadb.utils import embedding_functions
|
3 |
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
def get_client():
|
6 |
client = chromadb.PersistentClient(path="./chromadb_linux/")
|
7 |
MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
|
@@ -20,11 +33,14 @@ def update_collection(iter: int, text: object, client: chromadb.Collection):
|
|
20 |
client.add(documents=[text["text"]], metadatas=[{"source": "pdf"}], ids=[text["content"] + str(iter)])
|
21 |
|
22 |
|
23 |
-
def encode_image(
|
|
|
24 |
import base64
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
28 |
|
29 |
|
30 |
async def image_to_text(image) -> object:
|
@@ -55,28 +71,26 @@ async def image_to_text(image) -> object:
|
|
55 |
return json.loads(response.choices[0].message.content)
|
56 |
|
57 |
|
58 |
-
async def start_troggin_off(dir: str):
|
|
|
59 |
import os
|
60 |
from pdf2image import convert_from_path
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
if os.path.isdir(folder_path):
|
67 |
-
for file in os.listdir(folder_path):
|
68 |
-
if file.endswith(".pdf"):
|
69 |
-
print("Processing", file)
|
70 |
-
pdf_path = os.path.join(folder_path, file)
|
71 |
-
images = convert_from_path(pdf_path)
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
encoded_image = encode_image(f"out{i}.jpg")
|
76 |
-
text = await image_to_text(encoded_image)
|
77 |
-
update_collection(i, text, client)
|
78 |
|
|
|
|
|
|
|
|
|
79 |
|
80 |
if __name__ == "__main__":
|
81 |
import asyncio
|
82 |
-
|
|
|
|
|
|
2 |
from chromadb.utils import embedding_functions
|
3 |
|
4 |
|
5 |
+
def create_client():
    """Open the on-disk Chroma store and return its "schemer2" collection.

    Despite the name, the value returned is the collection handle (the object
    that ``update_collection`` calls ``.add`` on), not the
    ``PersistentClient`` itself. The collection is fetched with
    ``get_collection``, so it presumably has to exist already under
    ``./chromadb_linux/`` -- confirm against the Chroma version in use.
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")

    # Sentence-transformers model used for embeddings (~0.5 GB).
    model_name: str = "mixedbread-ai/mxbai-embed-large-v1"
    collection_name: str = "schemer2"
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_name
    )

    return store.get_collection(
        name=collection_name,
        embedding_function=embedder,
    )
|
17 |
+
|
18 |
def get_client():
|
19 |
client = chromadb.PersistentClient(path="./chromadb_linux/")
|
20 |
MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
|
|
|
33 |
client.add(documents=[text["text"]], metadatas=[{"source": "pdf"}], ids=[text["content"] + str(iter)])
|
34 |
|
35 |
|
36 |
+
def encode_image(image) -> str:
    """Return *image* as a base64-encoded JPEG string.

    Args:
        image: A PIL-style image object exposing ``save(fp, format=...)``.
            When its ``mode`` is one JPEG cannot store, it must also expose
            ``convert(mode)``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    import io
    import base64

    # JPEG has no alpha or palette support; Pillow raises OSError when asked
    # to write e.g. RGBA, LA or P images, so normalise those to RGB first.
    # Modes Pillow can already write as JPEG are left untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_modes:
        image = image.convert("RGB")

    byte_arr = io.BytesIO()
    image.save(byte_arr, format="JPEG")
    return base64.b64encode(byte_arr.getvalue()).decode("utf-8")
|
44 |
|
45 |
|
46 |
async def image_to_text(image) -> object:
|
|
|
71 |
return json.loads(response.choices[0].message.content)
|
72 |
|
73 |
|
74 |
+
async def _ingest_pdf(pdf_path: str, client) -> None:
    """Render each page of one PDF, transcribe it and add it to *client*."""
    # Imported lazily so that walking a tree without PDFs never needs it.
    from pdf2image import convert_from_path

    images = convert_from_path(pdf_path)
    for i, image in enumerate(images):
        encoded_image = encode_image(image)
        text = await image_to_text(encoded_image)
        # NOTE(review): ``i`` restarts at 0 for every PDF, and
        # update_collection builds the document id from
        # text["content"] + str(i); two PDFs yielding the same "content"
        # for the same page index would collide -- confirm this is intended.
        update_collection(i, text, client)


async def start_troggin_off(dir: str, client):
    """Recursively ingest every PDF found under *dir* into *client*.

    Sub-directories are walked depth-first; every file whose name ends in
    ``.pdf`` (case-insensitive) is rendered page by page, each page is passed
    through ``image_to_text`` and the result is stored via
    ``update_collection``.

    Args:
        dir: Path of the directory to scan.
        client: Collection handle forwarded unchanged to ``update_collection``.
    """
    # recursive
    import os

    # Sorted so that the ingestion order does not depend on the filesystem.
    for name in sorted(os.listdir(dir)):
        path = os.path.join(dir, name)
        if os.path.isdir(path):
            await start_troggin_off(path, client)  # recursive call
        elif name.lower().endswith(".pdf"):
            # ``elif``: a directory that merely happens to be named "*.pdf"
            # must be walked, not handed to the PDF renderer.
            await _ingest_pdf(path, client)
|
91 |
|
92 |
if __name__ == "__main__":
    import asyncio

    # Script entry point: open the collection, then walk data/ and embed
    # every PDF found there.
    collection = create_client()
    asyncio.run(start_troggin_off("data/", collection))
|
app.py
CHANGED
@@ -2,8 +2,10 @@ import streamlit as st
|
|
2 |
from openai import OpenAI
|
3 |
from params import params
|
4 |
from database import get_client
|
|
|
5 |
|
6 |
-
CLIENT = get_client()
|
|
|
7 |
APP_NAME: str = "Groove-GPT"
|
8 |
history = []
|
9 |
st.set_page_config(layout="wide")
|
@@ -11,6 +13,12 @@ st.set_page_config(layout="wide")
|
|
11 |
# INFO
|
12 |
st.title(APP_NAME)
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
l_col, r_col = st.columns((3, 1))
|
15 |
|
16 |
# param column
|
@@ -41,7 +49,7 @@ with l_col:
|
|
41 |
)
|
42 |
documents = results["documents"]
|
43 |
response = openai_client.chat.completions.create(
|
44 |
-
model=
|
45 |
messages=[
|
46 |
{
|
47 |
"role": "system",
|
|
|
2 |
from openai import OpenAI
|
3 |
from params import params
|
4 |
from database import get_client
|
5 |
+
from add_data import start_troggin_off, create_client
|
6 |
|
7 |
+
# CLIENT = get_client()
|
8 |
+
CLIENT = None
|
9 |
APP_NAME: str = "Groove-GPT"
|
10 |
history = []
|
11 |
st.set_page_config(layout="wide")
|
|
|
13 |
# INFO
|
14 |
st.title(APP_NAME)
|
15 |
|
16 |
+
|
17 |
+
# "Hacker man" button: (re)build the embeddings from the PDFs under ./data.
start_embedding = st.button("Hacker man")
if start_embedding:
    import asyncio

    CLIENT = create_client()
    # start_troggin_off is declared ``async def``: calling it only builds a
    # coroutine object. It has to be driven by an event loop, otherwise the
    # ingestion never runs and Python emits a "never awaited" warning.
    asyncio.run(start_troggin_off("./data", CLIENT))
|
21 |
+
|
22 |
l_col, r_col = st.columns((3, 1))
|
23 |
|
24 |
# param column
|
|
|
49 |
)
|
50 |
documents = results["documents"]
|
51 |
response = openai_client.chat.completions.create(
|
52 |
+
model=gpt_type,
|
53 |
messages=[
|
54 |
{
|
55 |
"role": "system",
|