Update README.md
README.md CHANGED
@@ -12,7 +12,9 @@ language:
 
 # Model Info
 
-This is a model that applies LLM2Vec to Swallow. Only the PEFT Adapter is distributed.
+This is a model that applies LLM2Vec to Llama-2. Only the PEFT Adapter is distributed.
+LLM2Vec is fine-tuned on two tasks: MNTP and SimCSE, and this repository contains the results of applying SimCSE after MNTP.
+For the MNTP Adapter, please refer to [this link](https://huggingface.co/uzabase/LLM2Vec-Llama-2-7b-hf-wikipedia-jp-mntp).
 
 ## Model Details
@@ -32,26 +34,70 @@ This is a model that applies LLM2Vec to Swallow. Only the PEFT Adapter is distributed.
 
 ## Usage
 
-- Please see [original LLM2Vec repo](https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp#usage)
+- Please see [original LLM2Vec repo](https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse#usage)
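For context, the following is a minimal sketch of the two-stage loading pattern described in the linked LLM2Vec model cards: load the MNTP adapter, merge it into the base weights, then apply the SimCSE adapter and encode with the `llm2vec` package. The `simcse_repo` identifier is a placeholder for this repository, and the exact base-model handling may differ from what is shown.

```python
# Minimal sketch, adapted from the loading pattern in the linked LLM2Vec model cards.
# `simcse_repo` is a placeholder; verify the exact repository IDs before use.
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftModel
from llm2vec import LLM2Vec

mntp_repo = "uzabase/LLM2Vec-Llama-2-7b-hf-wikipedia-jp-mntp"  # MNTP adapter linked above
simcse_repo = "<this-repository>"                              # SimCSE adapter (placeholder)

tokenizer = AutoTokenizer.from_pretrained(mntp_repo)
config = AutoConfig.from_pretrained(mntp_repo, trust_remote_code=True)
model = AutoModel.from_pretrained(
    mntp_repo, trust_remote_code=True, config=config, torch_dtype=torch.bfloat16
)

# Merge the MNTP adapter into the base weights, then load the SimCSE adapter on top.
model = PeftModel.from_pretrained(model, mntp_repo)
model = model.merge_and_unload()
model = PeftModel.from_pretrained(model, simcse_repo)

# Wrap with LLM2Vec and encode sentences into embeddings (mean pooling, as trained).
l2v = LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=128)
embeddings = l2v.encode(["今日は良い天気ですね。", "本日の天候は快晴です。"])
```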
 
 ## Training Details
 
 ### Training Data
 
-- [Wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia)
+- SimCSE training corpus built from [Wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia)
+- Script used to build the SimCSE corpus:
```python
import argparse
import random
import re
from pathlib import Path

from datasets import load_dataset
from tqdm import tqdm


def main(args):
    random.seed(args.seed)
    # Sample N articles from the Japanese Wikipedia dump (20231101.ja).
    wiki_ds = load_dataset("wikimedia/wikipedia", "20231101.ja")
    sampled_index = random.sample(range(len(wiki_ds["train"])), args.N)
    sample_wiki = wiki_ds["train"][sampled_index]

    output_texts = []
    for title, text in tqdm(zip(sample_wiki["title"], sample_wiki["text"])):
        output_texts.append(title)
        # Split the article body into sentences on newlines and "。".
        sentences = re.split("[\n。]", text)
        for sentence in sentences:
            # Keep only sentences longer than min_sentence_len characters.
            if len(sentence) > args.min_sentence_len:
                output_texts.append(sentence.strip() + "。")

    # Write one title or sentence per line.
    with args.output_path.open(mode="w") as f:
        for line in output_texts:
            f.write(line)
            f.write("\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--N", default=200000, type=int)
    parser.add_argument("--seed", default=42, type=int)
    parser.add_argument("-o", "--output_path", type=Path)
    parser.add_argument("--min_sentence_len", default=50, type=int)

    args = parser.parse_args()
    main(args)
```
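For reference, the script accepts the arguments defined in its `argparse` block, so a hypothetical invocation (the file name `make_simcse_corpus.py` is an assumed placeholder) would look like `python make_simcse_corpus.py --N 200000 --seed 42 --min_sentence_len 50 -o simcse_corpus.txt`.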
 
 #### Training Hyperparameter
+- simcse_dropout: 0.3
+- bidirectional: true
+- pooling_mode: "mean"
+- remove_unused_columns: false
+- learning_rate: 3e-5
+- loss_scale: 20
+- batch_size: 256
 - gradient_accumulation_steps: 1
-- max_seq_length
-- mask_token_type: "blank"
-- mlm_probability: 0.2
+- max_seq_length: 128
 - lora_r: 16
-- torch_dtype "bfloat16"
-- attn_implementation "flash_attention_2"
+- torch_dtype: "bfloat16"
+- attn_implementation: "flash_attention_2"
+- seed: 42
 - bf16: true
 - gradient_checkpointing: true
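As a rough illustration only, not the training code for this repository, a few of the hyperparameters above map onto a `transformers` + `peft` setup as in the sketch below; the base-model ID and the LoRA wiring are assumptions.

```python
# Hypothetical sketch of how some listed hyperparameters could be wired up.
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",               # assumed base model
    torch_dtype=torch.bfloat16,                # torch_dtype: "bfloat16"
    attn_implementation="flash_attention_2",   # attn_implementation
)
model.gradient_checkpointing_enable()          # gradient_checkpointing: true

lora_config = LoraConfig(r=16)                 # lora_r: 16
model = get_peft_model(model, lora_config)
```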
 
 #### Accelerator Settings
 - deepspeed_config: