Rahkakavee Baskaran committed on
Commit
3592072
1 Parent(s): 21e077c

add corpus embeddings

Browse files
Files changed (1) hide show
  1. creating_corpus_embeddings.py +26 -0
creating_corpus_embeddings.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Pre-compute sentence embeddings for the taxonomy labels.

Reads the processed taxonomy from ``taxonomy_processed_v3.json``, builds one
``"<group> - <label>"`` string per taxonomy entry, encodes all of those
strings with the ``and-effect/musterdatenkatalog_clf`` sentence-transformers
model on CPU, and writes the resulting tensor to ``corpus_embeddings.pt``
under the key ``"corpus_embeddings"``.

Note: the output is written with ``safetensors.torch.save_file``, so the file
is in safetensors format even though its name ends in ``.pt``.
"""
import json

from safetensors.torch import save_file
from sentence_transformers import SentenceTransformer


# load taxonomy
# JSON text is UTF-8; state the encoding explicitly so the labels are decoded
# the same way on every platform instead of depending on the locale default.
with open("taxonomy_processed_v3.json", "r", encoding="utf-8") as fp:
    taxonomy = json.load(fp)

# load model
# NOTE(review): use_auth_token=True presumably relies on locally stored
# Hugging Face credentials to access the model -- confirm before running
# in a fresh environment.
model = SentenceTransformer(
    model_name_or_path="and-effect/musterdatenkatalog_clf",
    device="cpu",
    use_auth_token=True,
)

# get corpus labels
# Each taxonomy entry is indexed by "group" and "label"; the two values are
# joined into a single text that is what gets embedded.
corpus_labels = [el["group"] + " - " + el["label"] for el in taxonomy]

# get corpus embeddings
# convert_to_tensor=True is requested so the result can be handed directly
# to the torch-based save_file below.
corpus_embeddings = model.encode(corpus_labels, convert_to_tensor=True)

# save corpus embeddings
tensors = {"corpus_embeddings": corpus_embeddings}

save_file(tensors, "corpus_embeddings.pt")