Spaces:
Running
Running
Add steps for reproducibility
Browse files- build_ann.py +39 -0
- create_db.py +24 -0
- vectorize.py +34 -0
build_ann.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Build an Annoy ANN index over precomputed definition embeddings.

Loads the embedding matrix from a safetensors file, adds every row to an
angular-distance Annoy index, saves the index to disk, and runs a
sanity-check query whose neighbor ids are resolved back to definition
text via DuckDB.
"""

import duckdb  # was imported mid-file; hoisted to the top per convention
from annoy import AnnoyIndex
from safetensors import safe_open
from tqdm import trange

safetensors_path = "definitions.safetensors"

# The tensor is stored under the key "vectors" (written by vectorize.py).
with safe_open(safetensors_path, framework="numpy") as f:
    vectors = f.get_tensor("vectors")

num_vectors, vector_dim = vectors.shape
print(f"Loaded {num_vectors} vectors of dimension {vector_dim}")

# Angular distance approximates cosine similarity — the usual choice for
# sentence-transformer embeddings.
index = AnnoyIndex(vector_dim, "angular")

for i in trange(num_vectors):
    index.add_item(i, vectors[i])

# More trees -> better recall, larger index, slower build.
num_trees = 25
index.build(num_trees)

index.save("definitions.ann")

# Sanity check: query with an indexed vector; its own id should come back
# as the nearest neighbor.
query_vector = vectors[0]
num_neighbors = 5

nearest_neighbors = index.get_nns_by_vector(query_vector, num_neighbors)
print(f"Indices of {num_neighbors} nearest neighbors:", nearest_neighbors)

neighbors_with_distances = index.get_nns_by_vector(
    query_vector, num_neighbors, include_distances=True
)
print("Neighbors with distances:", neighbors_with_distances)

# Resolve Annoy item ids (== definitions.entry_id — presumably assigned by
# create_db.py; verify) back to readable text. Parameterized query: unnest
# expands the bound Python list into a set of ids.
conn = duckdb.connect("sonajaht.db")
try:
    query = "SELECT word_id, value FROM definitions WHERE entry_id in (SELECT unnest(?))"
    print(conn.execute(query, [nearest_neighbors]).df())
finally:
    # was missing: the connection was never closed
    conn.close()
|
create_db.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Create the local DuckDB database from the adorkin/sonajaht dataset.

Downloads the "definitions" and "words" configurations, strips HTML
markup from the definition text, keeps only Estonian definitions longer
than five characters, and materializes both tables into sonajaht.db.
"""

import re

import duckdb
from datasets import load_dataset

definitions_ds = load_dataset("adorkin/sonajaht", "definitions")
words_ds = load_dataset("adorkin/sonajaht", "words")

definitions = definitions_ds["definitions"].to_pandas()
# Remove HTML tags, then collapse every whitespace run to a single space.
definitions.value = definitions.value.str.replace(
    re.compile(r"<[^>]*>"), "", regex=True
).apply(lambda el: " ".join(el.split()))

# DuckDB resolves `definitions` to the local DataFrame by name
# (replacement scan). LENGTH(value) > 5 drops degenerate entries left
# over after tag stripping.
definitions = duckdb.query(
    "SELECT * FROM definitions WHERE lang = 'est' AND LENGTH(value) > 5"
).df()

# Positional index becomes entry_id — the stable row id used downstream
# (e.g. as the Annoy item id).
definitions.reset_index(inplace=True, names="entry_id")

words = words_ds["words"].to_pandas()

conn = duckdb.connect("sonajaht.db")
try:
    conn.execute("CREATE TABLE definitions AS SELECT * FROM definitions")
    conn.execute("CREATE TABLE words AS SELECT * FROM words")
finally:
    # was missing: close the connection so the database file is flushed
    conn.close()
|
vectorize.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Vectorize all definition texts with LaBSE and save them to safetensors.

Streams rows from the definitions table in batches, encodes each batch
with sentence-transformers, and writes the stacked matrix to
definitions.safetensors under the key "vectors".
"""

import duckdb
import numpy as np
from sentence_transformers import SentenceTransformer
from safetensors.numpy import save_file
from tqdm import tqdm

conn = duckdb.connect("sonajaht.db")

model = SentenceTransformer("sentence-transformers/LaBSE")

# Row order of this SELECT defines the row order of the output matrix —
# downstream code presumably relies on it matching entry_id order; verify.
query = "SELECT value FROM definitions"
result = conn.execute(query)

vectors = []
batch_size = 64

p_bar = tqdm()
# Stream in batches instead of materializing the whole table in memory.
while chunk := result.fetchmany(batch_size):
    values = [row[0] for row in chunk]
    vectors.append(
        model.encode(
            # NOTE(review): device="mps" is Apple-Silicon-only — consider
            # parameterizing or letting sentence-transformers auto-select.
            values, show_progress_bar=False, batch_size=batch_size, device="mps"
        )
    )
    # was p_bar.update(batch_size): overcounted on the final short chunk
    p_bar.update(len(chunk))
p_bar.close()  # was missing: leave the progress line in a clean state

vectors = np.concatenate(vectors)
save_file(dict(vectors=vectors), "definitions.safetensors")

conn.close()
|