adorkin committed on
Commit
ade3b7e
1 Parent(s): 4e8a334

Add steps for reproducibility

Browse files
Files changed (3) hide show
  1. build_ann.py +39 -0
  2. create_db.py +24 -0
  3. vectorize.py +34 -0
build_ann.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build an Annoy ANN index over precomputed definition vectors and run a sanity query.

Reads the vectors written by vectorize.py, builds an angular-distance Annoy
index, saves it, and looks up the nearest neighbors of the first vector,
mapping the hit indices back to rows in the DuckDB database.
"""

import duckdb  # moved to top-of-file: imports belong with the other imports, not mid-script
from annoy import AnnoyIndex
from safetensors import safe_open
from tqdm import trange

safetensors_path = "definitions.safetensors"

# Load the full embedding matrix produced by vectorize.py.
with safe_open(safetensors_path, framework="numpy") as f:
    vectors = f.get_tensor("vectors")

num_vectors, vector_dim = vectors.shape
print(f"Loaded {num_vectors} vectors of dimension {vector_dim}")

# Angular distance ~ cosine similarity — the usual choice for sentence embeddings.
index = AnnoyIndex(vector_dim, "angular")

for i in trange(num_vectors):
    index.add_item(i, vectors[i])

num_trees = 25  # more trees -> better recall at the cost of a larger index
index.build(num_trees)

index.save("definitions.ann")

# Sanity check: query with the first vector; item 0 should be its own nearest neighbor.
query_vector = vectors[0]
num_neighbors = 5

nearest_neighbors = index.get_nns_by_vector(query_vector, num_neighbors)
print(f"Indices of {num_neighbors} nearest neighbors:", nearest_neighbors)

neighbors_with_distances = index.get_nns_by_vector(
    query_vector, num_neighbors, include_distances=True
)
print("Neighbors with distances:", neighbors_with_distances)

# Map Annoy item ids back to rows; entry_id lines up with the item ids because
# both come from the same dataframe row order — presumably set by create_db.py's
# reset_index (NOTE(review): confirm the two scripts stay in sync).
conn = duckdb.connect("sonajaht.db")
query = "SELECT word_id, value FROM definitions WHERE entry_id in (SELECT unnest(?))"
print(conn.execute(query, [nearest_neighbors]).df())
conn.close()  # release the on-disk database handle (matches vectorize.py's style)
create_db.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Download the sonajaht dataset, clean definition texts, and materialize a DuckDB database.

Produces sonajaht.db with two tables, `definitions` and `words`, consumed by
vectorize.py and build_ann.py.
"""

import re

import duckdb
from datasets import load_dataset

definitions_ds = load_dataset("adorkin/sonajaht", "definitions")
words_ds = load_dataset("adorkin/sonajaht", "words")

definitions = definitions_ds["definitions"].to_pandas()
# Strip HTML tags, then collapse all whitespace runs to single spaces.
definitions.value = definitions.value.str.replace(
    re.compile(r"<[^>]*>"), "", regex=True
).apply(lambda el: " ".join(el.split()))

# Keep only Estonian definitions with non-trivial text. DuckDB's replacement
# scan resolves the bare name `definitions` against the local dataframe.
definitions = duckdb.query(
    "SELECT * FROM definitions WHERE lang = 'est' AND LENGTH(value) > 5"
).df()

# entry_id becomes a dense 0..N-1 id over the filtered rows; downstream code
# (build_ann.py) relies on it matching the Annoy item ids.
definitions.reset_index(inplace=True, names="entry_id")

words = words_ds["words"].to_pandas()

conn = duckdb.connect("sonajaht.db")
conn.execute("CREATE TABLE definitions AS SELECT * FROM definitions")
conn.execute("CREATE TABLE words AS SELECT * FROM words")
conn.close()  # explicitly flush and release the database file (was leaked before)
vectorize.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Encode every definition in sonajaht.db with LaBSE and save the vectors.

Streams rows from the `definitions` table in batches, embeds them with
sentence-transformers' LaBSE model, and writes the stacked matrix to
definitions.safetensors under the key "vectors".
"""

import duckdb
import numpy as np
from sentence_transformers import SentenceTransformer
from safetensors.numpy import save_file
from tqdm import tqdm

batch_size = 64
# NOTE(review): "mps" is Apple-silicon only — hurts reproducibility on other
# machines; switch to "cuda" or "cpu" as appropriate.
device = "mps"

conn = duckdb.connect("sonajaht.db")

model = SentenceTransformer("sentence-transformers/LaBSE")

query = "SELECT value FROM definitions"
result = conn.execute(query)

vectors = []

# Context manager guarantees the progress bar is closed (it was leaked before);
# fetchmany streams rows so the whole table never sits in memory as Python objects.
with tqdm() as p_bar:
    while chunk := result.fetchmany(batch_size):
        values = [row[0] for row in chunk]
        vectors.append(
            model.encode(
                values, show_progress_bar=False, batch_size=batch_size, device=device
            )
        )
        p_bar.update(batch_size)

vectors = np.concatenate(vectors)
save_file(dict(vectors=vectors), "definitions.safetensors")

conn.close()