abhishekarora committed
Commit 6fa1fd1 • 1 Parent(s): ee998a0

Modified validation and training for linktransformer model

Browse files:
- .gitattributes: +3 -0
- 1_Pooling/config.json: +3 -1
- LT_training_config.json: +15 -7
- README.md: +38 -16
- config.json: +2 -2
- model.safetensors: +3 -0
- special_tokens_map.json: +42 -6
- tokenizer.json: +2 -2
- tokenizer_config.json: +50 -8
.gitattributes CHANGED

```diff
@@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/bf/e9/bfe950b415bc9506b72a2f73eed59afd4840841deae18772931c069d89d51f23 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/b6/0b/b60b6b43406a48bf3638526314f3d232d97058bc93472ff2de930d43686fa441 filter=lfs diff=lfs merge=lfs -text
```
1_Pooling/config.json CHANGED

```diff
@@ -3,5 +3,7 @@
   "pooling_mode_cls_token": false,
   "pooling_mode_mean_tokens": true,
   "pooling_mode_max_tokens": false,
-  "pooling_mode_mean_sqrt_len_tokens": false
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false
 }
```
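The pooling head above is plain mean pooling over token embeddings; every other mode, including the two newly recorded ones, is disabled. As a quick sanity check of the model card's claim that the repo works directly with sentence-transformers, here is a minimal sketch; the two alias strings are chosen arbitrarily:

```python
from sentence_transformers import SentenceTransformer

# Mean pooling per 1_Pooling/config.json yields one 768-dim vector per input.
model = SentenceTransformer("dell-research-harvard/lt-wikidata-comp-multi")

embeddings = model.encode(["International Business Machines", "IBM"])
print(embeddings.shape)  # (2, 768)
```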
LT_training_config.json CHANGED

```diff
@@ -1,19 +1,26 @@
 {
   "model_save_dir": "models",
   "model_save_name": "linkage_multi_aliases",
-  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for …",
+  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 70 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
   "opt_model_lang": [
+    "de",
     "en",
+    "zh",
+    "ja",
+    "hi",
+    "ar",
+    "bn",
+    "pt",
+    "ru",
     "es",
     "fr",
-    "…",
-    "ja",
-    "zh"
+    "ko"
   ],
   "train_batch_size": 64,
-  "num_epochs": …,
+  "num_epochs": 70,
   "warm_up_perc": 1,
-  "learning_rate": 2e-…,
+  "learning_rate": 2e-05,
+  "loss_type": "supcon",
   "val_perc": 0.2,
   "wandb_names": {
     "project": "linkage",
@@ -23,10 +30,11 @@
   },
   "add_pooling_layer": false,
   "large_val": true,
-  "eval_steps_perc": 0.…,
+  "eval_steps_perc": 0.5,
   "test_at_end": true,
   "save_val_test_pickles": true,
   "val_query_prop": 0.5,
+  "loss_params": {},
   "eval_type": "retrieval",
   "training_dataset": "dataframe",
   "base_model_path": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
```
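A config like this is written out by LinkTransformer's training entry point rather than by hand. The sketch below shows roughly how a run producing this file might be launched; `lt.train_model` is the package's documented trainer, but the CSV path, column names, and the exact keyword arguments here are assumptions for illustration, not taken from this commit:

```python
import linktransformer as lt

# Hypothetical alias-pair data; the real run used company aliases from Wikidata.
# Assumed keyword arguments, loosely mirroring the fields in LT_training_config.json.
saved_model_path = lt.train_model(
    model_path="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    dataset_path="company_aliases.csv",  # hypothetical file of paired aliases
    left_col_names=["alias_1"],          # hypothetical column names
    right_col_names=["alias_2"],
    training_args={"num_epochs": 70, "train_batch_size": 64,
                   "learning_rate": 2e-05, "loss_type": "supcon"},
)
print(saved_model_path)  # folder with the weights plus LT_training_config.json
```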
README.md CHANGED

````diff
@@ -1,12 +1,18 @@
 ---
 pipeline_tag: sentence-similarity
 language:
+- de
 - en
+- zh
+- ja
+- hi
+- ar
+- bn
+- pt
+- ru
 - es
 - fr
-- …
-- ja
-- zh
+- ko
 tags:
 - linktransformer
 - sentence-transformers
@@ -17,23 +23,29 @@ tags:
 
 # dell-research-harvard/lt-wikidata-comp-multi
 
-This is a [LinkTransformer](https://github.…) model. …
+This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core this is a [sentence-transformers](https://www.SBERT.net) model - it just wraps around the class.
 It is designed for quick and easy record linkage (entity-matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
 Notwithstanding that, it can be used for any sentence similarity task within the sentence-transformers framework as well.
 It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
 Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
 
 
-This model has been fine-tuned on the model : sentence-transformers/paraphrase-multilingual-mpnet-base-v2. It is pretrained for the language : - en
+This model has been fine-tuned on the model : sentence-transformers/paraphrase-multilingual-mpnet-base-v2. It is pretrained for the language : - de
+- en
+- zh
+- ja
+- hi
+- ar
+- bn
+- pt
+- ru
 - es
 - fr
-- …
-- ja
-- zh.
+- ko.
 
 
 This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework.
-It was trained for …
+It was trained for 70 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
 
 
 ## Usage (LinkTransformer)
@@ -107,7 +119,7 @@ The model was trained with the parameters:
 
 **DataLoader**:
 
-`torch.utils.data.dataloader.DataLoader` of length … with parameters:
+`torch.utils.data.dataloader.DataLoader` of length 5966 with parameters:
 ```
 {'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
@@ -119,17 +131,17 @@ The model was trained with the parameters:
 Parameters of the fit()-Method:
 ```
 {
-    "epochs": …,
-    "evaluation_steps": …,
+    "epochs": 70,
+    "evaluation_steps": 2983,
     "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
     "max_grad_norm": 1,
     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
     "optimizer_params": {
-        "lr": 2e-…
+        "lr": 2e-05
     },
     "scheduler": "WarmupLinear",
     "steps_per_epoch": null,
-    "warmup_steps": …,
+    "warmup_steps": 417620,
     "weight_decay": 0.01
 }
 ```
@@ -139,10 +151,20 @@ Parameters of the fit()-Method:
 
 LinkTransformer(
   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
-  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
 )
 ```
 
 ## Citing & Authors
 
-
+```
+@misc{arora2023linktransformer,
+      title={LinkTransformer: A Unified Package for Record Linkage with Transformer Language Models},
+      author={Abhishek Arora and Melissa Dell},
+      year={2023},
+      eprint={2309.00789},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+
+```
````
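The README's `## Usage (LinkTransformer)` block falls outside the diff hunks, so for context here is a minimal sketch of the record-linkage workflow the card describes, assuming the `lt.merge` call documented by the LinkTransformer package; the toy dataframes and the `CompanyName` column are illustrative:

```python
import pandas as pd
import linktransformer as lt

# Two toy tables of company names to link (illustrative data).
df1 = pd.DataFrame({"CompanyName": ["Intl Business Machines", "Siemens Aktiengesellschaft"]})
df2 = pd.DataFrame({"CompanyName": ["IBM", "Siemens AG"]})

# Semantic merge: rows are matched on embedding similarity from this model.
df_matched = lt.merge(
    df1,
    df2,
    merge_type="1:m",
    on="CompanyName",
    model="dell-research-harvard/lt-wikidata-comp-multi",
)
print(df_matched.head())
```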
config.json CHANGED

```diff
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "models/linkage_multi_aliases…",
+  "_name_or_path": "models/linkage_multi_aliases",
   "architectures": [
     "XLMRobertaModel"
   ],
@@ -22,7 +22,7 @@
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.…",
+  "transformers_version": "4.35.1",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 250002
```
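Since the checkpoint is a standard XLM-RoBERTa encoder, the updated config also loads with plain transformers; a minimal sketch, where the printed values follow from config.json above:

```python
from transformers import AutoModel

# Instantiates the XLMRobertaModel architecture declared in config.json.
model = AutoModel.from_pretrained("dell-research-harvard/lt-wikidata-comp-multi")
print(model.config.vocab_size)    # 250002
print(model.config.pad_token_id)  # 1
```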
model.safetensors ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3d23a8c3194fddc392984009fb03fbfd1ce072746d1ced463d0feb407bd0059
+size 1112197096
```
special_tokens_map.json CHANGED

```diff
@@ -1,7 +1,25 @@
 {
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "mask_token": {
     "content": "<mask>",
     "lstrip": true,
@@ -9,7 +27,25 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "unk_token": "<unk>"
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
```
tokenizer.json CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:c835b069d7b8cd02b400e6247b83bc1840ab12bb1628d5b2e03c8d728de75558
+size 17082941
```
tokenizer_config.json CHANGED

```diff
@@ -1,19 +1,61 @@
 {
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
   "cls_token": "<s>",
   "eos_token": "</s>",
-  "mask_token": {
-    "__type": "AddedToken",
-    "content": "<mask>",
-    "lstrip": true,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
+  "mask_token": "<mask>",
+  "max_length": 128,
   "model_max_length": 512,
+  "pad_to_multiple_of": null,
   "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
   "sep_token": "</s>",
+  "stride": 0,
   "tokenizer_class": "XLMRobertaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "<unk>"
 }
```
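The rewritten tokenizer_config.json now inlines the special tokens via added_tokens_decoder and records the padding and truncation defaults explicitly. Loading and inspecting it is straightforward; a small sketch with an arbitrary input string:

```python
from transformers import AutoTokenizer

# Loads the XLMRobertaTokenizer configured above (model_max_length 512).
tokenizer = AutoTokenizer.from_pretrained("dell-research-harvard/lt-wikidata-comp-multi")

enc = tokenizer("Siemens AG", truncation=True, max_length=128)
print(enc["input_ids"])         # wrapped in <s> (id 0) ... </s> (id 2)
print(tokenizer.mask_token_id)  # 250001, per added_tokens_decoder
```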