sonajaht-demo / vectorize.py
adorkin's picture
Add steps for reproducibility
ade3b7e verified
raw
history blame
765 Bytes
import duckdb
import numpy as np
import torch
from safetensors.numpy import save_file
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# Encode every row of `definitions.value` from sonajaht.db with LaBSE and
# store the stacked embeddings under the key "vectors" in
# definitions.safetensors.

conn = duckdb.connect("sonajaht.db")
model = SentenceTransformer("sentence-transformers/LaBSE")

# Pick the compute device at runtime instead of hard-coding "mps", which only
# exists on Apple-silicon builds of PyTorch. MPS is still preferred so the
# behaviour on the original machine is unchanged.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): the query has no ORDER BY, so the row order (and therefore
# which vector index belongs to which definition) is whatever the engine
# returns. If downstream code matches vectors to rows by position, confirm
# that it reads the table with the same query/order.
query = "SELECT value FROM definitions"
vectors = []
batch_size = 64

try:
    result = conn.execute(query)
    # The context manager closes the bar even if encoding fails midway.
    with tqdm() as p_bar:
        while True:
            chunk = result.fetchmany(batch_size)
            if not chunk:
                break
            # Each row is a 1-tuple; unwrap it to the bare text value.
            values = [row[0] for row in chunk]
            vectors.append(
                model.encode(
                    values,
                    show_progress_bar=False,
                    batch_size=batch_size,
                    device=device,
                )
            )
            # Count the rows actually processed: the final chunk is usually
            # shorter than batch_size.
            p_bar.update(len(chunk))
finally:
    # Release the database handle whether or not encoding succeeded.
    conn.close()

if not vectors:
    # np.concatenate([]) would raise an opaque ValueError; keep the exception
    # type but say what actually went wrong.
    raise ValueError(f"query returned no rows, nothing to encode: {query!r}")

vectors = np.concatenate(vectors)
save_file({"vectors": vectors}, "definitions.safetensors")