andufkova committed on
Commit cc7a4cf • 1 Parent(s): c0f3faf

code fixes for new model
app.py CHANGED
@@ -71,7 +71,7 @@ def get_words(doc_emb):
 
         #print(lang, end=": ")
 
-        scores = mul_model.E[lang] @ (doc_emb).T
+        scores = mul_model.E[lang].detach().numpy() @ (doc_emb).T
         k_ixs = np.argsort(scores)[::-1][:topn].squeeze() # sort them in descending order and pick topn
         tmp = []
         for i in k_ixs:
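With the retrained model, mul_model.E[lang] is a torch.nn.Parameter rather than a NumPy array, so it has to be detached from the autograd graph and converted before the NumPy matmul. A minimal sketch of that conversion (the variable values are made up for illustration; the cpu() call only matters if the parameter lives on a GPU):

    import numpy as np
    import torch

    E_lang = torch.nn.Parameter(torch.randn(5, 3))   # stands in for mul_model.E[lang]
    doc_emb = np.random.randn(1, 3)                  # stands in for a document embedding

    # detach from autograd, move to CPU, hand a plain ndarray to NumPy
    scores = E_lang.detach().cpu().numpy() @ doc_emb.T
    print(scores.shape)  # (5, 1)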
learn_multi_doc_model.py CHANGED
@@ -8,13 +8,15 @@ import pickle
 from scipy.special import log_softmax
 from time import time
 from packaging import version
+import torch
 
 assert version.parse(scipy.__version__) >= version.parse(
     "1.7.0"
 ), f"Requries scipy > 1.7.0. Found {scipy.__version__}"
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-class Model:
+class Model(torch.nn.Module):
     """Model defintion, parameters and helper fucntions to compute log-likelihood"""
 
     def __init__(self, vocab: dict, emb_dim: int):
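In the __init__ hunk that follows, E and b become torch.nn.ParameterDict entries and each freshly built Parameter is moved with .to(device). On a CUDA machine that .to() call returns a new tensor rather than the registered leaf Parameter, so a common pattern (a sketch under that assumption, not the committed code) is to register parameters on CPU and let model.to(device), which main() now calls, move them in place:

    import numpy as np
    import torch

    # hypothetical mini-module; names are illustrative, not from the commit
    class TinyModel(torch.nn.Module):
        def __init__(self, vocab_size: int, emb_dim: int):
            super().__init__()
            n1 = 1.0 / np.sqrt(emb_dim)
            n2 = 1.0 / np.sqrt(vocab_size)
            self.E = torch.nn.ParameterDict()
            # build the Parameter on CPU; registration keeps it a leaf tensor
            self.E["en"] = torch.nn.Parameter(
                torch.from_numpy(np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))).float()
            )

    m = TinyModel(10, 4)
    m.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))  # moves parameters in place
    print(all(p.is_leaf for p in m.parameters()))  # True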
@@ -25,23 +27,31 @@ class Model:
         emb_dim: embedding dimension, will be same across languages
         """
 
+        super().__init__()
+
         self.L = len(vocab)
         self.vocab = vocab
         self.emb_dim = emb_dim
 
         # word embeddings matrix / subspace for each language
-        self.E = {}
+        # self.E = {} # torch.nn.ModuleDict
+        # self.E = torch.nn.ModuleDict()
+        self.E = torch.nn.ParameterDict()
 
         # bias vector for each language
-        self.b = {}
+        # self.b = {} # torch.nn.ModuleDict
+        # self.b = torch.nn.ModuleDict()
+        self.b = torch.nn.ParameterDict()
 
         n1 = 1.0 / np.sqrt(emb_dim)
 
        # initialize word embeddings and bias vectors randomly
         for lang, vocab_size in vocab.items():
             n2 = 1.0 / np.sqrt(vocab_size)
-            self.E[lang] = np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))
-            self.b[lang] = np.random.randn(vocab_size, 1) * 0.0001
+            # self.E[lang] = torch.nn.ParameterList(torch.from_numpy(np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))))
+            self.E[lang] = torch.nn.Parameter(torch.Tensor(np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))),
+                                              requires_grad=True).to(device)
+            self.b[lang] = torch.nn.Parameter(torch.Tensor(np.random.randn(vocab_size, 1) * 0.0001), requires_grad=True).to(device)
 
     def init_bias_with_log_unigram_dist(self, X, lang):
         """We will initialize the bias vector with log of unigram distribution over vocabulary.
@@ -56,9 +66,13 @@ class Model:
         else:
             X = X.A + 1e-08 # to avoid any zeros
 
-        self.b[lang][:, 0] = np.log(
-            X.sum(axis=0) / X.sum()
-        ) # we would like b to of size (W, 1)
+        # self.b[lang][:, 0] = np.log(
+        #     X.sum(axis=0) / X.sum()
+        # ) # we would like b to of size (W, 1)
+
+        b_copy = self.b[lang].clone()
+        b_copy[:, 0] = torch.from_numpy(np.log(X.sum(axis=0) / X.sum()))
+        self.b[lang] = torch.nn.Parameter(b_copy, requires_grad=True)
 
     def compute_log_thetas(self, lang: str, DE_lang: np.ndarray, sanity_check=False):
         """Compute log of thetas, where theta_d is the unigram distribution over document `d`
@@ -70,14 +84,17 @@ class Model:
             DE_lang (np.ndarray): Document embeddings of language
         """
 
-        mat = self.b[lang] + (self.E[lang] @ DE_lang) # shape is vocab_size x n_docs
+        # mat = self.b[lang] + (self.E[lang] @ DE_lang) # shape is vocab_size x n_docs
+        mat = self.b[lang] + (self.E[lang].double() @ torch.from_numpy(DE_lang).double().to(device))
+        # mat = mat.detach()
+        # mat = mat.detach().T
         mat = mat.T # shape is D x W
 
         # log_norm = logsumexp(mat, axis=1)
         # log_thetas = mat - log_norm
 
         # the following single step is same the two above steps combined
-        log_thetas = log_softmax(mat, axis=1) # shape is n_docs x vocab_size
+        log_thetas = log_softmax(mat.detach().numpy(), axis=1) # shape is n_docs x vocab_size
 
         if sanity_check:
             n_docs = DE_lang.shape[0]
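Because mat.detach().numpy() hands the matrix to SciPy's log_softmax, the autograd graph is cut at this point and gradients cannot flow back into E and b through the thetas. If that flow is wanted, a torch-native equivalent keeps the graph intact; a small sketch of the alternative (an assumption about intent, not the committed code):

    import torch
    import torch.nn.functional as F

    # stand-in for b[lang] + E[lang] @ DE_lang, shape vocab_size x n_docs
    mat = torch.randn(7, 3, dtype=torch.double, requires_grad=True)

    log_thetas = F.log_softmax(mat.T, dim=1)  # n_docs x vocab_size, still differentiable
    log_thetas.sum().backward()               # gradients reach `mat`
    print(mat.grad.shape)                     # torch.Size([7, 3])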
@@ -114,9 +131,22 @@ class Model:
             llh = (X * log_thetas).sum()
         else:
             # X is a scipy sparse matrix
-            llh = (X.multiply(log_thetas)).sum()
+            # this is the tricky part in pytorch
+
+            coo = X.tocoo()
+
+            row_ixs = torch.LongTensor(coo.row).to(device)
+            col_ixs = torch.LongTensor(coo.col).to(device)
+            data = torch.FloatTensor(coo.data).to(device)
+
+            # llh = (X.multiply(log_thetas)).sum()
+
+            log_thetas_tensor = torch.from_numpy(log_thetas)
+
+            llh = (log_thetas_tensor[row_ixs, col_ixs] * data).sum()
+            # TODO row_ixs, col_ixs, data
 
-        return llh
+        return llh * (-1.0) # * -1.0 when using pytorch to get negative llh (loss)
 
 
 def gradients_WE(model, lang, DE_lang, X, alpha):
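The COO-based gather above replaces SciPy's X.multiply(log_thetas).sum(); the two agree because only the nonzero counts of X contribute to the sum. A quick illustrative check on synthetic data (not part of the commit):

    import numpy as np
    import scipy.sparse as sp
    import torch

    X = sp.random(4, 6, density=0.3, format="csr", random_state=0)     # toy bag-of-words
    log_thetas = np.log(np.random.dirichlet(np.ones(6), size=4))       # toy log-probabilities

    coo = X.tocoo()
    rows = torch.from_numpy(coo.row.astype(np.int64))
    cols = torch.from_numpy(coo.col.astype(np.int64))
    data = torch.from_numpy(coo.data)

    llh_torch = (torch.from_numpy(log_thetas)[rows, cols] * data).sum().item()
    llh_scipy = X.multiply(log_thetas).sum()
    print(np.isclose(llh_torch, llh_scipy))  # True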
@@ -156,9 +186,13 @@ def gradients_WE(model, lang, DE_lang, X, alpha):
     ef_grads = np.zeros_like(model.E)
 
     tmp = (
-        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
+        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
     ).A # .A will convert matrix to np ndarray
-    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum()
+    # ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum(axis=1, keepdims=True)
+
+    m = model.E[lang].detach().numpy()
+    # ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum(axis=1, keepdims=True)
+    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * m).sum(axis=1, keepdims=True)
 
     # Sanity check to see if gradients computed in both ways are numerically identical
     # print('- All close grad_E:', np.allclose(ef_grads, grads))
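A note on the update_parameters hunk below: params.detach() + learning_rate * gradient produces a brand-new plain tensor, so whatever gets written back into the model must be re-wrapped as a Parameter (as init_bias_with_log_unigram_dist now does for b). An in-place update under torch.no_grad() keeps the Parameter object itself; a minimal sketch, not the committed code:

    import torch

    # `params` and `gradient` are illustrative stand-ins with matching shapes
    params = torch.nn.Parameter(torch.zeros(3, 2))
    gradient = torch.ones(3, 2)
    learning_rate = 0.1

    with torch.no_grad():
        params.add_(learning_rate * gradient)  # gradient-ascent step, params stays a leaf Parameter

    print(params.is_leaf, params.requires_grad)  # True True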
@@ -181,14 +215,14 @@ def update_parameters(params, gradient, learning_rate):
     """
 
     assert (
-        params.shape == gradient.shape
+        params.shape == gradient.shape
     ), "The params and gradient must have same shape, \
         ({:d}, {:d}) != ({:d} {:d})".format(
         *params.shape, *gradient.shape
     )
 
-    new_params = params + (
-        learning_rate * gradient
+    new_params = params.detach() + (
+        learning_rate * gradient
     ) # since we are doing gradient ascent
     return new_params
 
@@ -197,6 +231,7 @@ def train(model, bow, DE, args):
     """Training scheme for the model"""
 
     print("\nTraining started ..")
+    optim = torch.optim.Adam(model.parameters(), lr=args.lr)
     learning_rate = args.lr
     llh_0 = 0.0
     for lang, X in bow.items():
@@ -209,7 +244,13 @@ def train(model, bow, DE, args):
 
         llh_ei = 0.0
         for lang, X in bow.items():
+            # for pytorch
+            optim.zero_grad()
+            # get row_ixs, col_ixs, data from X
 
+            # compute neg llh
+            #loss = torch.tensor(llh_ei, requires_grad=True)
+            #loss = torch.as_tensor(llh_ei).detach().clone()
             # update word embeddings E for lang, by keeping doc-embeddings A fixed
             grad_E = gradients_WE(model, lang, DE[lang].T, X, args.alpha)
 
@@ -217,6 +258,13 @@ def train(model, bow, DE, args):
 
             llh_ei += model.compute_log_likelihood(lang, DE[lang].T, X)
 
+        loss = torch.tensor(llh_ei, requires_grad=True)
+        loss.backward()
+
+        optim.step()
+
+
+
         print(
             "Epoch {:4d} / {:4d} | Log-likelihood: {:16.2f} | Learning rate: {:f}".format(
                 i, args.epochs, llh_ei, learning_rate
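One caveat worth flagging: the graph was already cut at mat.detach() inside compute_log_thetas, and loss = torch.tensor(llh_ei, requires_grad=True) builds a fresh leaf, so loss.backward() cannot populate gradients for E and b; only the explicit gradients_WE / update_parameters path changes the model here. If Adam is meant to drive the update, the loss would have to stay an attached tensor end to end. A self-contained sketch of that pattern with a toy module (assumed, not the committed model):

    import torch

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.w = torch.nn.Parameter(torch.randn(4))

        def neg_llh(self, x: torch.Tensor) -> torch.Tensor:
            return -(self.w * x).sum()          # stays attached to the graph

    model = Toy()
    optim = torch.optim.Adam(model.parameters(), lr=1e-2)

    optim.zero_grad()
    loss = torch.zeros(())
    for x in [torch.ones(4), torch.full((4,), 2.0)]:
        loss = loss + model.neg_llh(x)          # accumulate as a tensor, not a Python float
    loss.backward()                             # model.w.grad is now populated
    optim.step()
    print(model.w.grad is not None)             # True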
@@ -229,18 +277,18 @@ def train(model, bow, DE, args):
                 "Instead it decreased, which means the updates have overshooted.",
                 "Halving the learning_rate.",
             )
-            learning_rate = learning_rate * 0.5
+            #learning_rate = learning_rate * 0.5
 
         llhs.append(llh_ei)
 
-        # learning_rate scheduler
+        # ylearning_rate scheduler
         # we reduce the learning_rate by 10 % after every 10 epochs
-        # if i % 10 == 0:
-        #     print("Reducing the learning by a factor of 0.1 every 10 epcohs")
-        #     learning_rate -= learning_rate * 0.1
+        if i % 10 == 0:
+            print("Reducing the learning by a factor of 0.1 every 10 epcohs")
+            learning_rate -= learning_rate * 0.1
         if i % 100 == 0:
             with open(
-                os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb"
+                os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb"
             ) as fpw:
                 pickle.dump(model, fpw)
             np.savetxt(
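The re-enabled decay (10 % off the learning rate every 10 epochs) has a ready-made counterpart once an optimizer object drives the updates: torch.optim.lr_scheduler.StepLR with step_size=10 and gamma=0.9 multiplies the optimizer's learning rate by 0.9 every 10 scheduler steps. A sketch of that option (offered as an alternative, not the committed behaviour):

    import torch

    params = [torch.nn.Parameter(torch.zeros(2))]
    optim = torch.optim.Adam(params, lr=0.1)
    sched = torch.optim.lr_scheduler.StepLR(optim, step_size=10, gamma=0.9)

    for epoch in range(1, 31):
        optim.step()   # loss.backward() would precede this in real training
        sched.step()   # learning rate is multiplied by 0.9 every 10 steps
    print(optim.param_groups[0]["lr"])  # 0.1 * 0.9**3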
@@ -283,12 +331,13 @@ def main():
 
         # assert the number of docs per language are same in embeddings and bag-of-words
         assert (
-            bows[lang].shape[0] == doc_embs[lang].shape[0]
+            bows[lang].shape[0] == doc_embs[lang].shape[0]
         ), "Number of docs in BoW ({:d}) != number of docs in embeddigs ({:d}) for language: {:s}".format(
             bows[lang].shape[0], doc_embs[lang].shape[0], lang
         )
 
     model = Model(vocab, emb_dim)
+    model.to(device)
     for lang, bow in bows.items():
         model.init_bias_with_log_unigram_dist(bow, lang)
 
@@ -304,7 +353,7 @@ def main():
     model, llhs = train(model, bows, doc_embs, args)
 
     with open(
-        os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb"
+        os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb"
     ) as fpw:
         pickle.dump(model, fpw)
 
@@ -317,7 +366,6 @@ def main():
 
 
 def parse_arguments():
-
     parser = argparse.ArgumentParser(
         description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
topic_discovery/.DS_Store CHANGED
Binary files a/topic_discovery/.DS_Store and b/topic_discovery/.DS_Store differ
 
topic_discovery/{cvect_25000_ar.pkl → cvect_100000_ar.pkl} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b37e9e016646662718993e2368f9e88c4c21141f8944f23449f27c6d59e03221
-size 3047285
+oid sha256:bc2a5a244e0554ffd18efee81ece7f4997136eb7caa6fdaa142e44e264983291
+size 3232315
topic_discovery/{cvect_25000_bn.pkl → cvect_100000_bn.pkl} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:05b3adf720d522a38762fda2bb6da2c948389a437b2138004698d326181d971d
-size 157149
+oid sha256:f27fa832a05cf8d6b2389a83e490526bf54763940b49f817f3ace830b332200c
+size 125341
topic_discovery/{cvect_25000_de.pkl → cvect_100000_de.pkl} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e551d8934e6a8e23c841437805bbed1b0e17eb2f3ab3e260b9104c1e30f452ad
-size 2037400
+oid sha256:7ea2571476bac39c97563a7d2ec94879de43abd9877edbbadb76dcda600167f5
+size 1869324
topic_discovery/{cvect_25000_el.pkl → cvect_100000_el.pkl} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5419f509f5666ae55a7f5cdfb1cf7ea41f3fa102ec639c19c4aeea8b2dffe32
-size 3681045
+oid sha256:10fa08af8eccba5723694050feffdc7fdc7c698d7b62502b499534e7493d8ab1
+size 4068227
topic_discovery/cvect_100000_en.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25da04442b2ff7ffae9bbe69f09e464aadf4a389c70fe6f283e9b0309d636a81
+size 5019023
topic_discovery/cvect_100000_es.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70e1d60edf8e6b09736968209cf476fd60db5b12e3ad221593ac02061ec13307
+size 5396926
topic_discovery/cvect_100000_fr.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34af4a4569ccd5e82c9218a288fe92091d3d7444f3b3a570707dd03ad464150c
+size 3513313
topic_discovery/cvect_100000_it.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2135701318838fcee53219515e369ef7b2cb9451884b18e734ee840372f34bd7
+size 2810588
topic_discovery/cvect_100000_jp.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dbb2536a0f269c7ab9e71f38cffb69f3fd5925d7b43eabcdc5050f1bea5b6f9
+size 3040253
topic_discovery/cvect_100000_mk.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3ef12ac4c0952ca01c9191cb78c2e63e80b77c8a1faaae05133ff23ad26f161
+size 1931126
topic_discovery/cvect_100000_nl.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39d3baf77620a5b2a4c8084af2d442595a968cad90b05653fc328870171e3733
+size 1159719
topic_discovery/cvect_100000_pl.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcde383c79f519a4bce289379008b31fd35598df920cb6ad55fa3a2aa305a56d
+size 1981167
topic_discovery/cvect_100000_pt.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a363d8aa2f5b1ef3877d429e02bd7823374989d5f10ed9814c574860e317b698
+size 2068800
topic_discovery/cvect_100000_ru.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6beff3c5bd38755ac0ab8d34b6d86589d2777ed9361f0c59cdda9e7d04ef6031
+size 6251799
topic_discovery/cvect_100000_zhs.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d1f8f6d9b0d37d6a2c03df52711d4de319a74ef6b0d5229e2fe7f7115f3fe6
+size 9212102
topic_discovery/cvect_100000_zht.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34538fb71bf08905b8461484e152c2f950515c03e8ffc0e427b1ec1db5ee3406
+size 9724371
topic_discovery/cvect_25000_en.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb0ee36e4ef6738d408e30132c5d970be2e05728c305fccce06dc67b3941bea2
-size 4143980
topic_discovery/cvect_25000_es.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d28eb842e6f4717a791de9c8c61014131dbea8d26f84f90c62cd54b05595a1c9
-size 4235561
topic_discovery/cvect_25000_fr.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:74ff26b2269c2033f78ecb1e5870c449423d42d668975e5e98e899b6d2489f64
-size 2967490
topic_discovery/cvect_25000_it.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4e8892d88fd88e0d9e121e57e1b77810e47d34909944b2e65e2094d426f17daa
-size 2477565
topic_discovery/cvect_25000_jp.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c075e83209a4a23afe290aef6a301717f4eadfd118a278114ea142fdf882c20
-size 3082086
topic_discovery/cvect_25000_mg.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:958dd98498097b8463b1fbc6f068b512650d40397b9e53659dc2238032126181
-size 3643714
topic_discovery/cvect_25000_mk.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6758e48f3626b7c91b7359097d27aedb6beaeb36c6a6632901c3fae3f6da5ea3
-size 2152452
topic_discovery/cvect_25000_nl.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5f81d4942757d07cde33715cd00fe150c377b19070f57cc992230b8c6eeacb06
-size 1466263
topic_discovery/cvect_25000_pl.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ad1d1d8853aa424ba47c81d52ab6fdd708d1a440901652d680482d092a88a44a
-size 2063425
topic_discovery/cvect_25000_pt.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:baef6e3fe017ed4feb3ac2e08701b77b4425ade9f39d700ab3d1b4a2d89059d6
-size 2001188
topic_discovery/cvect_25000_ru.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:89bfa381364b0df772b0a181df8740bf597733e328410c464e6690d58e8e212f
-size 5482015
topic_discovery/cvect_25000_zhs.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1369c082d071340da56006eef8ffc380625c39fef4a7034b7d1e2927b1f54717
-size 9390903
topic_discovery/cvect_25000_zht.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:030a1c4b66cfecf4645de14f77d90d56886e8927225581c94e45a93006c0c633
-size 9965443
topic_discovery/cvects.key CHANGED
@@ -1,17 +1,17 @@
-en topic_discovery/cvect_25000_en.pkl
-es topic_discovery/cvect_25000_es.pkl
-fr topic_discovery/cvect_25000_fr.pkl
-mg topic_discovery/cvect_25000_mg.pkl
-it topic_discovery/cvect_25000_it.pkl
-el topic_discovery/cvect_25000_el.pkl
-zhs topic_discovery/cvect_25000_zhs.pkl
-zht topic_discovery/cvect_25000_zht.pkl
-bn topic_discovery/cvect_25000_bn.pkl
-ru topic_discovery/cvect_25000_ru.pkl
-pt topic_discovery/cvect_25000_pt.pkl
-ar topic_discovery/cvect_25000_ar.pkl
-de topic_discovery/cvect_25000_de.pkl
-jp topic_discovery/cvect_25000_jp.pkl
-mk topic_discovery/cvect_25000_mk.pkl
-pl topic_discovery/cvect_25000_pl.pkl
-nl topic_discovery/cvect_25000_nl.pkl
+en topic_discovery/cvect_100000_en.pkl
+es topic_discovery/cvect_100000_es.pkl
+fr topic_discovery/cvect_100000_fr.pkl
+mg topic_discovery/cvect_100000_mg.pkl
+it topic_discovery/cvect_100000_it.pkl
+el topic_discovery/cvect_100000_el.pkl
+zhs topic_discovery/cvect_100000_zhs.pkl
+zht topic_discovery/cvect_100000_zht.pkl
+bn topic_discovery/cvect_100000_bn.pkl
+ru topic_discovery/cvect_100000_ru.pkl
+pt topic_discovery/cvect_100000_pt.pkl
+ar topic_discovery/cvect_100000_ar.pkl
+de topic_discovery/cvect_100000_de.pkl
+jp topic_discovery/cvect_100000_jp.pkl
+mk topic_discovery/cvect_100000_mk.pkl
+pl topic_discovery/cvect_100000_pl.pkl
+nl topic_discovery/cvect_100000_nl.pkl