Spaces:
Running
Running
File size: 1,073 Bytes
ade3b7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
from annoy import AnnoyIndex
from safetensors import safe_open
from tqdm import trange
# Load the tensor stored under the key "vectors" as a NumPy array.
# NOTE(review): presumably one embedding per definition row -- confirm
# against whatever produced definitions.safetensors.
safetensors_path = "definitions.safetensors"
with safe_open(safetensors_path, framework="numpy") as f:
    # The read must sit inside the `with` body: the handle is only valid
    # while the context manager is open.
    vectors = f.get_tensor("vectors")

# Unpacking into exactly two names requires `vectors` to be 2-D
# (rows, dimensions); any other rank raises ValueError here.
num_vectors, vector_dim = vectors.shape
print(f"Loaded {num_vectors} vectors of dimension {vector_dim}")
# Build an Annoy index over every row of `vectors` using the "angular"
# metric, then persist it to disk.
index = AnnoyIndex(vector_dim, "angular")
for i in trange(num_vectors):
    # The item id is the row position, so ids returned by later searches
    # can be used to index straight back into `vectors`.
    index.add_item(i, vectors[i])

# Number of trees in the forest passed to build().
num_trees = 25
index.build(num_trees)
index.save("definitions.ann")
# Smoke-test the index by querying it with the first stored vector.
query_vector = vectors[0]
num_neighbors = 5

# A single search with include_distances=True returns an (ids, distances)
# pair, so the plain id list is taken from it instead of running the same
# search a second time.
neighbors_with_distances = index.get_nns_by_vector(
    query_vector, num_neighbors, include_distances=True
)
nearest_neighbors = neighbors_with_distances[0]

print(f"Indices of {num_neighbors} nearest neighbors:", nearest_neighbors)
print("Neighbors with distances:", neighbors_with_distances)
import duckdb

# Fetch the definition rows whose entry_id is among the neighbour ids.
# The id list is bound as one parameter and expanded in SQL with unnest().
# NOTE(review): Annoy ids are row positions in `vectors`; this assumes they
# coincide with definitions.entry_id -- confirm against the table.
query = "SELECT word_id, value FROM definitions WHERE entry_id in (SELECT unnest(?))"

# The context manager closes the connection (and releases the database
# file) even if the query raises.
with duckdb.connect("sonajaht.db") as conn:
    print(conn.execute(query, [nearest_neighbors]).df())
|