File size: 1,073 Bytes
ade3b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from annoy import AnnoyIndex
from safetensors import safe_open
from tqdm import trange

# Step 1: load the stored vectors, index them with Annoy, and save the index.
safetensors_path = "definitions.safetensors"

# Read the "vectors" tensor through the numpy framework of safetensors.
with safe_open(safetensors_path, framework="numpy") as tensor_file:
    vectors = tensor_file.get_tensor("vectors")

# The two-way unpacking requires a 2-D array: one row per vector.
num_vectors, vector_dim = vectors.shape
print(f"Loaded {num_vectors} vectors of dimension {vector_dim}")

# Item ids are the row positions in `vectors`, so ids returned by a search can
# be used to index back into the array.
index = AnnoyIndex(vector_dim, "angular")
for item_id, vector in zip(trange(num_vectors), vectors):
    index.add_item(item_id, vector)

# The index must be built before it can be saved or queried.
num_trees = 25
index.build(num_trees)
index.save("definitions.ann")

# Step 2: sanity-check the index by searching for the neighbours of the first
# stored vector.
query_vector = vectors[0]
num_neighbors = 5

# One search with include_distances=True yields both the ids and their
# distances, so the index is queried only once.
neighbors_with_distances = index.get_nns_by_vector(
    query_vector, num_neighbors, include_distances=True
)
# Annoy returns an (ids, distances) pair when distances are requested.
nearest_neighbors = neighbors_with_distances[0]

print(f"Indices of {num_neighbors} nearest neighbors:", nearest_neighbors)
print("Neighbors with distances:", neighbors_with_distances)

# Only this final lookup step uses DuckDB.
import duckdb

# Step 3: print the definitions whose entry_id is among the neighbour ids.
# NOTE(review): this assumes the Annoy item ids (row positions in `vectors`)
# coincide with `entry_id` values in the `definitions` table — confirm against
# how the vectors were exported.
query = "SELECT word_id, value FROM definitions WHERE entry_id in (SELECT unnest(?))"

# The context manager closes the connection on exit, even if the query raises.
with duckdb.connect("sonajaht.db") as conn:
    print(conn.execute(query, [nearest_neighbors]).df())