Spaces:
Running
Running
import duckdb | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
from safetensors.numpy import save_file | |
from tqdm import tqdm | |
# Embed every row of the `definitions` table with the LaBSE sentence encoder
# and store the stacked embeddings in "definitions.safetensors" under the key
# "vectors".
conn = duckdb.connect("sonajaht.db")
vectors = []
batch_size = 64
try:
    model = SentenceTransformer("sentence-transformers/LaBSE")
    # NOTE(review): the query has no ORDER BY, so row i of the saved matrix
    # matches definition i only if consumers read the table in the same scan
    # order -- confirm against whatever loads this file.
    # NOTE(review): a NULL `value` would reach model.encode() as None --
    # confirm the column is NOT NULL.
    query = "SELECT value FROM definitions"
    result = conn.execute(query)
    # The total row count is not queried, so the bar only shows a running
    # count; the context manager closes it even if encoding fails.
    with tqdm(unit="rows") as p_bar:
        while True:
            # Stream the result set in batches rather than loading every
            # definition into memory at once.
            chunk = result.fetchmany(batch_size)
            if not chunk:
                break
            values = [row[0] for row in chunk]
            vectors.append(
                model.encode(
                    values,
                    show_progress_bar=False,
                    batch_size=batch_size,
                    device="mps",
                )
            )
            # The final chunk may be shorter than batch_size; count the rows
            # actually encoded so the bar does not overshoot.
            p_bar.update(len(chunk))
finally:
    # Release the database handle whether or not embedding succeeded.
    conn.close()

if not vectors:
    # np.concatenate([]) fails with an opaque message; say what went wrong.
    raise ValueError(
        "no rows returned by 'SELECT value FROM definitions'; nothing to embed"
    )

# Stack the per-batch arrays into a single array along the first axis.
vectors = np.concatenate(vectors)
save_file({"vectors": vectors}, "definitions.safetensors")