code fixes for new model
- app.py +1 -1
- learn_multi_doc_model.py +74 -26
- topic_discovery/.DS_Store +0 -0
- topic_discovery/{cvect_25000_ar.pkl → cvect_100000_ar.pkl} +2 -2
- topic_discovery/{cvect_25000_bn.pkl → cvect_100000_bn.pkl} +2 -2
- topic_discovery/{cvect_25000_de.pkl → cvect_100000_de.pkl} +2 -2
- topic_discovery/{cvect_25000_el.pkl → cvect_100000_el.pkl} +2 -2
- topic_discovery/cvect_100000_en.pkl +3 -0
- topic_discovery/cvect_100000_es.pkl +3 -0
- topic_discovery/cvect_100000_fr.pkl +3 -0
- topic_discovery/cvect_100000_it.pkl +3 -0
- topic_discovery/cvect_100000_jp.pkl +3 -0
- topic_discovery/cvect_100000_mk.pkl +3 -0
- topic_discovery/cvect_100000_nl.pkl +3 -0
- topic_discovery/cvect_100000_pl.pkl +3 -0
- topic_discovery/cvect_100000_pt.pkl +3 -0
- topic_discovery/cvect_100000_ru.pkl +3 -0
- topic_discovery/cvect_100000_zhs.pkl +3 -0
- topic_discovery/cvect_100000_zht.pkl +3 -0
- topic_discovery/cvect_25000_en.pkl +0 -3
- topic_discovery/cvect_25000_es.pkl +0 -3
- topic_discovery/cvect_25000_fr.pkl +0 -3
- topic_discovery/cvect_25000_it.pkl +0 -3
- topic_discovery/cvect_25000_jp.pkl +0 -3
- topic_discovery/cvect_25000_mg.pkl +0 -3
- topic_discovery/cvect_25000_mk.pkl +0 -3
- topic_discovery/cvect_25000_nl.pkl +0 -3
- topic_discovery/cvect_25000_pl.pkl +0 -3
- topic_discovery/cvect_25000_pt.pkl +0 -3
- topic_discovery/cvect_25000_ru.pkl +0 -3
- topic_discovery/cvect_25000_zhs.pkl +0 -3
- topic_discovery/cvect_25000_zht.pkl +0 -3
- topic_discovery/cvects.key +17 -17
app.py
CHANGED
@@ -71,7 +71,7 @@ def get_words(doc_emb):
 
        #print(lang, end=": ")
 
-        scores = mul_model.E[lang] @ (doc_emb).T
+        scores = mul_model.E[lang].detach().numpy() @ (doc_emb).T
        k_ixs = np.argsort(scores)[::-1][:topn].squeeze()  # sort them in descending order and pick topn
        tmp = []
        for i in k_ixs:
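The one-line app.py change is the numpy/PyTorch boundary: with the new model, the per-language embedding matrix mul_model.E[lang] is a torch Parameter, so it has to be detached from the autograd graph and converted to a numpy array before it can be multiplied with the numpy document embedding. A minimal sketch of that interop with made-up shapes (vocab_size, emb_dim and the random doc_emb are placeholders, not values from the app):

import numpy as np
import torch

vocab_size, emb_dim = 1000, 64                                 # hypothetical sizes
E_lang = torch.nn.Parameter(torch.randn(vocab_size, emb_dim))  # stands in for mul_model.E[lang]
doc_emb = np.random.randn(1, emb_dim)                          # numpy document embedding

# detach from the graph and move to numpy before mixing with numpy math
scores = E_lang.detach().numpy() @ doc_emb.T                   # shape (vocab_size, 1)
topn = 10
k_ixs = np.argsort(scores.squeeze())[::-1][:topn]              # indices of the topn words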
learn_multi_doc_model.py
CHANGED
@@ -8,13 +8,15 @@ import pickle
 from scipy.special import log_softmax
 from time import time
 from packaging import version
+import torch
 
 assert version.parse(scipy.__version__) >= version.parse(
     "1.7.0"
 ), f"Requries scipy > 1.7.0. Found {scipy.__version__}"
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-class Model:
+class Model(torch.nn.Module):
     """Model defintion, parameters and helper fucntions to compute log-likelihood"""
 
     def __init__(self, vocab: dict, emb_dim: int):
@@ -25,23 +27,31 @@ class Model:
             emb_dim: embedding dimension, will be same across languages
         """
 
+        super().__init__()
+
         self.L = len(vocab)
         self.vocab = vocab
         self.emb_dim = emb_dim
 
         # word embeddings matrix / subspace for each language
-        self.E = {}
+        # self.E = {} # torch.nn.ModuleDict
+        # self.E = torch.nn.ModuleDict()
+        self.E = torch.nn.ParameterDict()
 
         # bias vector for each language
-        self.b = {}
+        # self.b = {} # torch.nn.ModuleDict
+        # self.b = torch.nn.ModuleDict()
+        self.b = torch.nn.ParameterDict()
 
         n1 = 1.0 / np.sqrt(emb_dim)
 
         # initialize word embeddings and bias vectors randomly
         for lang, vocab_size in vocab.items():
            n2 = 1.0 / np.sqrt(vocab_size)
-            self.E[lang] = np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))
-            self.
+            # self.E[lang] = torch.nn.ParameterList(torch.from_numpy(np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))))
+            self.E[lang] = torch.nn.Parameter(torch.Tensor(np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))),
+                                              requires_grad=True).to(device)
+            self.b[lang] = torch.nn.Parameter(torch.Tensor(np.random.randn(vocab_size, 1) * 0.0001), requires_grad=True).to(device)
 
     def init_bias_with_log_unigram_dist(self, X, lang):
         """We will initialize the bias vector with log of unigram distribution over vocabulary.
@@ -56,9 +66,13 @@ class Model:
         else:
            X = X.A + 1e-08  # to avoid any zeros
 
-        self.b[lang][:, 0] = np.log(
-            X.sum(axis=0) / X.sum()
-        )  # we would like b to of size (W, 1)
+        # self.b[lang][:, 0] = np.log(
+        #     X.sum(axis=0) / X.sum()
+        # )  # we would like b to of size (W, 1)
+
+        b_copy = self.b[lang].clone()
+        b_copy[:, 0] = torch.from_numpy(np.log(X.sum(axis=0) / X.sum()))
+        self.b[lang] = torch.nn.Parameter(b_copy, requires_grad=True)
 
     def compute_log_thetas(self, lang: str, DE_lang: np.ndarray, sanity_check=False):
         """Compute log of thetas, where theta_d is the unigram distribution over document `d`
@@ -70,14 +84,17 @@ class Model:
            DE_lang (np.ndarray): Document embeddings of language
         """
 
-        mat = self.b[lang] + (self.E[lang] @ DE_lang)  # shape is vocab_size x n_docs
+        # mat = self.b[lang] + (self.E[lang] @ DE_lang)  # shape is vocab_size x n_docs
+        mat = self.b[lang] + (self.E[lang].double() @ torch.from_numpy(DE_lang).double().to(device))
+        # mat = mat.detach()
+        # mat = mat.detach().T
         mat = mat.T  # shape is D x W
 
         # log_norm = logsumexp(mat, axis=1)
        # log_thetas = mat - log_norm
 
         # the following single step is same the two above steps combined
-        log_thetas = log_softmax(mat, axis=1)  # shape is n_docs x vocab_size
+        log_thetas = log_softmax(mat.detach().numpy(), axis=1)  # shape is n_docs x vocab_size
 
         if sanity_check:
            n_docs = DE_lang.shape[0]
@@ -114,9 +131,22 @@ class Model:
            llh = (X * log_thetas).sum()
         else:
            # X is a scipy sparse matrix
-            llh = (X.multiply(log_thetas)).sum()
+            # this is the tricky part in pytorch
+
+            coo = X.tocoo()
+
+            row_ixs = torch.LongTensor(coo.row).to(device)
+            col_ixs = torch.LongTensor(coo.col).to(device)
+            data = torch.FloatTensor(coo.data).to(device)
+
+            # llh = (X.multiply(log_thetas)).sum()
+
+            log_thetas_tensor = torch.from_numpy(log_thetas)
+
+            llh = (log_thetas_tensor[row_ixs, col_ixs] * data).sum()
+            # TODO row_ixs, col_ixs, data
 
-        return llh
+        return llh * (-1.0)  # * -1.0 when using pytorch to get negative llh (loss)
 
 
 def gradients_WE(model, lang, DE_lang, X, alpha):
@@ -156,9 +186,13 @@ def gradients_WE(model, lang, DE_lang, X, alpha):
     ef_grads = np.zeros_like(model.E)
 
     tmp = (
-
+        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
     ).A  # .A will convert matrix to np ndarray
-    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum()
+    # ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum(axis=1, keepdims=True)
+
+    m = model.E[lang].detach().numpy()
+    # ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum(axis=1, keepdims=True)
+    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * m).sum(axis=1, keepdims=True)
 
     # Sanity check to see if gradients computed in both ways are numerically identical
     # print('- All close grad_E:', np.allclose(ef_grads, grads))
@@ -181,14 +215,14 @@ def update_parameters(params, gradient, learning_rate):
     """
 
     assert (
-
+        params.shape == gradient.shape
     ), "The params and gradient must have same shape, \
        ({:d}, {:d}) != ({:d} {:d})".format(
        *params.shape, *gradient.shape
     )
 
-    new_params = params + (
-
+    new_params = params.detach() + (
+        learning_rate * gradient
     )  # since we are doing gradient ascent
     return new_params
 
@@ -197,6 +231,7 @@ def train(model, bow, DE, args):
     """Training scheme for the model"""
 
     print("\nTraining started ..")
+    optim = torch.optim.Adam(model.parameters(), lr=args.lr)
     learning_rate = args.lr
     llh_0 = 0.0
     for lang, X in bow.items():
@@ -209,7 +244,13 @@
 
        llh_ei = 0.0
        for lang, X in bow.items():
+            # for pytorch
+            optim.zero_grad()
+            # get row_ixs, col_ixs, data from X
 
+            # compute neg llh
+            #loss = torch.tensor(llh_ei, requires_grad=True)
+            #loss = torch.as_tensor(llh_ei).detach().clone()
            # update word embeddings E for lang, by keeping doc-embeddings A fixed
            grad_E = gradients_WE(model, lang, DE[lang].T, X, args.alpha)
 
@@ -217,6 +258,13 @@
 
            llh_ei += model.compute_log_likelihood(lang, DE[lang].T, X)
 
+            loss = torch.tensor(llh_ei, requires_grad=True)
+            loss.backward()
+
+            optim.step()
+
+
+
        print(
            "Epoch {:4d} / {:4d} | Log-likelihood: {:16.2f} | Learning rate: {:f}".format(
                i, args.epochs, llh_ei, learning_rate
@@ -229,18 +277,18 @@
                "Instead it decreased, which means the updates have overshooted.",
                "Halving the learning_rate.",
            )
-            learning_rate = learning_rate * 0.5
+            #learning_rate = learning_rate * 0.5
 
        llhs.append(llh_ei)
 
-        #
+        # ylearning_rate scheduler
        # we reduce the learning_rate by 10 % after every 10 epochs
-
-
-
+        if i % 10 == 0:
+            print("Reducing the learning by a factor of 0.1 every 10 epcohs")
+            learning_rate -= learning_rate * 0.1
        if i % 100 == 0:
            with open(
-
+                os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb"
            ) as fpw:
                pickle.dump(model, fpw)
            np.savetxt(
@@ -283,12 +331,13 @@ def main():
 
        # assert the number of docs per language are same in embeddings and bag-of-words
        assert (
-
+            bows[lang].shape[0] == doc_embs[lang].shape[0]
        ), "Number of docs in BoW ({:d}) != number of docs in embeddigs ({:d}) for language: {:s}".format(
            bows[lang].shape[0], doc_embs[lang].shape[0], lang
        )
 
     model = Model(vocab, emb_dim)
+    model.to(device)
     for lang, bow in bows.items():
        model.init_bias_with_log_unigram_dist(bow, lang)
 
@@ -304,7 +353,7 @@
     model, llhs = train(model, bows, doc_embs, args)
 
     with open(
-
+        os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb"
     ) as fpw:
        pickle.dump(model, fpw)
 
@@ -317,7 +366,6 @@
 
 
 def parse_arguments():
-
     parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
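The compute_log_likelihood change above replaces the scipy X.multiply(log_thetas) reduction with an index-based version: the bag-of-words matrix is converted to COO form and log_thetas is gathered only at the nonzero (document, word) positions, weighted by the counts. A self-contained sketch of that trick with tiny placeholder matrices (the sizes and random data are illustrative, not taken from the repo); both versions agree up to float32 rounding:

import numpy as np
import scipy.sparse as sp
import torch

n_docs, vocab_size = 4, 6
X = sp.random(n_docs, vocab_size, density=0.4, format="csr") * 10     # fake word counts
log_thetas = np.log(np.full((n_docs, vocab_size), 1.0 / vocab_size))  # fake per-doc log-probs

# dense-style reference: sum over all (d, w) of X[d, w] * log_thetas[d, w]
llh_ref = (X.multiply(log_thetas)).sum()

# sparse / PyTorch-friendly version: only touch the nonzero entries of X
coo = X.tocoo()
row_ixs = torch.LongTensor(coo.row)
col_ixs = torch.LongTensor(coo.col)
data = torch.FloatTensor(coo.data)
llh = (torch.from_numpy(log_thetas)[row_ixs, col_ixs] * data).sum()

print(np.isclose(llh_ref, llh.item()))   # True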
topic_discovery/.DS_Store
CHANGED
Binary files a/topic_discovery/.DS_Store and b/topic_discovery/.DS_Store differ
topic_discovery/{cvect_25000_ar.pkl → cvect_100000_ar.pkl}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bc2a5a244e0554ffd18efee81ece7f4997136eb7caa6fdaa142e44e264983291
+size 3232315

topic_discovery/{cvect_25000_bn.pkl → cvect_100000_bn.pkl}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f27fa832a05cf8d6b2389a83e490526bf54763940b49f817f3ace830b332200c
+size 125341

topic_discovery/{cvect_25000_de.pkl → cvect_100000_de.pkl}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7ea2571476bac39c97563a7d2ec94879de43abd9877edbbadb76dcda600167f5
+size 1869324

topic_discovery/{cvect_25000_el.pkl → cvect_100000_el.pkl}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:10fa08af8eccba5723694050feffdc7fdc7c698d7b62502b499534e7493d8ab1
+size 4068227
topic_discovery/cvect_100000_en.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25da04442b2ff7ffae9bbe69f09e464aadf4a389c70fe6f283e9b0309d636a81
+size 5019023

topic_discovery/cvect_100000_es.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70e1d60edf8e6b09736968209cf476fd60db5b12e3ad221593ac02061ec13307
+size 5396926

topic_discovery/cvect_100000_fr.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34af4a4569ccd5e82c9218a288fe92091d3d7444f3b3a570707dd03ad464150c
+size 3513313

topic_discovery/cvect_100000_it.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2135701318838fcee53219515e369ef7b2cb9451884b18e734ee840372f34bd7
+size 2810588

topic_discovery/cvect_100000_jp.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dbb2536a0f269c7ab9e71f38cffb69f3fd5925d7b43eabcdc5050f1bea5b6f9
+size 3040253

topic_discovery/cvect_100000_mk.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3ef12ac4c0952ca01c9191cb78c2e63e80b77c8a1faaae05133ff23ad26f161
+size 1931126

topic_discovery/cvect_100000_nl.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39d3baf77620a5b2a4c8084af2d442595a968cad90b05653fc328870171e3733
+size 1159719

topic_discovery/cvect_100000_pl.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcde383c79f519a4bce289379008b31fd35598df920cb6ad55fa3a2aa305a56d
+size 1981167

topic_discovery/cvect_100000_pt.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a363d8aa2f5b1ef3877d429e02bd7823374989d5f10ed9814c574860e317b698
+size 2068800

topic_discovery/cvect_100000_ru.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6beff3c5bd38755ac0ab8d34b6d86589d2777ed9361f0c59cdda9e7d04ef6031
+size 6251799

topic_discovery/cvect_100000_zhs.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d1f8f6d9b0d37d6a2c03df52711d4de319a74ef6b0d5229e2fe7f7115f3fe6
+size 9212102

topic_discovery/cvect_100000_zht.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34538fb71bf08905b8461484e152c2f950515c03e8ffc0e427b1ec1db5ee3406
+size 9724371
topic_discovery/cvect_25000_en.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb0ee36e4ef6738d408e30132c5d970be2e05728c305fccce06dc67b3941bea2
-size 4143980

topic_discovery/cvect_25000_es.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d28eb842e6f4717a791de9c8c61014131dbea8d26f84f90c62cd54b05595a1c9
-size 4235561

topic_discovery/cvect_25000_fr.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:74ff26b2269c2033f78ecb1e5870c449423d42d668975e5e98e899b6d2489f64
-size 2967490

topic_discovery/cvect_25000_it.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4e8892d88fd88e0d9e121e57e1b77810e47d34909944b2e65e2094d426f17daa
-size 2477565

topic_discovery/cvect_25000_jp.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c075e83209a4a23afe290aef6a301717f4eadfd118a278114ea142fdf882c20
-size 3082086

topic_discovery/cvect_25000_mg.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:958dd98498097b8463b1fbc6f068b512650d40397b9e53659dc2238032126181
-size 3643714

topic_discovery/cvect_25000_mk.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6758e48f3626b7c91b7359097d27aedb6beaeb36c6a6632901c3fae3f6da5ea3
-size 2152452

topic_discovery/cvect_25000_nl.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5f81d4942757d07cde33715cd00fe150c377b19070f57cc992230b8c6eeacb06
-size 1466263

topic_discovery/cvect_25000_pl.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ad1d1d8853aa424ba47c81d52ab6fdd708d1a440901652d680482d092a88a44a
-size 2063425

topic_discovery/cvect_25000_pt.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:baef6e3fe017ed4feb3ac2e08701b77b4425ade9f39d700ab3d1b4a2d89059d6
-size 2001188

topic_discovery/cvect_25000_ru.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:89bfa381364b0df772b0a181df8740bf597733e328410c464e6690d58e8e212f
-size 5482015

topic_discovery/cvect_25000_zhs.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1369c082d071340da56006eef8ffc380625c39fef4a7034b7d1e2927b1f54717
-size 9390903

topic_discovery/cvect_25000_zht.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:030a1c4b66cfecf4645de14f77d90d56886e8927225581c94e45a93006c0c633
-size 9965443
topic_discovery/cvects.key
CHANGED
@@ -1,17 +1,17 @@
-en topic_discovery/
-es topic_discovery/
-fr topic_discovery/
-mg topic_discovery/
-it topic_discovery/
-el topic_discovery/
-zhs topic_discovery/
-zht topic_discovery/
-bn topic_discovery/
-ru topic_discovery/
-pt topic_discovery/
-ar topic_discovery/
-de topic_discovery/
-jp topic_discovery/
-mk topic_discovery/
-pl topic_discovery/
-nl topic_discovery/
+en topic_discovery/cvect_100000_en.pkl
+es topic_discovery/cvect_100000_es.pkl
+fr topic_discovery/cvect_100000_fr.pkl
+mg topic_discovery/cvect_100000_mg.pkl
+it topic_discovery/cvect_100000_it.pkl
+el topic_discovery/cvect_100000_el.pkl
+zhs topic_discovery/cvect_100000_zhs.pkl
+zht topic_discovery/cvect_100000_zht.pkl
+bn topic_discovery/cvect_100000_bn.pkl
+ru topic_discovery/cvect_100000_ru.pkl
+pt topic_discovery/cvect_100000_pt.pkl
+ar topic_discovery/cvect_100000_ar.pkl
+de topic_discovery/cvect_100000_de.pkl
+jp topic_discovery/cvect_100000_jp.pkl
+mk topic_discovery/cvect_100000_mk.pkl
+pl topic_discovery/cvect_100000_pl.pkl
+nl topic_discovery/cvect_100000_nl.pkl
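For reference, each line of topic_discovery/cvects.key now maps a language code to its 100000-feature vectorizer pickle. A hedged sketch of how such a key file could be consumed (the loading code below is an assumption for illustration, not necessarily how app.py does it; the .pkl files are Git LFS pointers, so they need to be pulled before unpickling):

import pickle

cvects = {}
with open("topic_discovery/cvects.key") as fp:
    for line in fp:
        lang, path = line.split()             # e.g. "en", "topic_discovery/cvect_100000_en.pkl"
        with open(path, "rb") as fpkl:
            cvects[lang] = pickle.load(fpkl)  # fitted vectorizer for that language

print(sorted(cvects))                         # language codes found in the key file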