Update README.md
Updated sample code
README.md
CHANGED
@@ -33,15 +33,16 @@ The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtai
 
 ```python
 
-#pip install tokenizers==0.10.3 transformers==4.8.0
 
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
+#pip install tokenizers==0.10.3 transformers==4.8.0
+
 tokenizer = AutoTokenizer.from_pretrained("Norod78/distilgpt2-base-pretrained-he")
 model = AutoModelForCausalLM.from_pretrained("Norod78/distilgpt2-base-pretrained-he", pad_token_id=tokenizer.eos_token_id)
 
-prompt_text = "
-max_len =
+prompt_text = "הנבחרת האולימפית של ישראל זכתה השנה"
+max_len = 50
 sample_output_num = 3
 seed = 1000
 
@@ -80,10 +81,7 @@ if input_ids != None:
 print("Updated max_len = " + str(max_len))
 
 stop_token = "<|endoftext|>"
-new_lines = "\
-\
-\
-"
+new_lines = "\n\n\n"
 
 sample_outputs = model.generate(
     input_ids,
@@ -94,9 +92,7 @@ sample_outputs = model.generate(
     num_return_sequences=sample_output_num
 )
 
-print(100 * '-' + "\
-\t\tOutput\
-" + 100 * '-')
+print(100 * '-' + "\n\t\tOutput\n" + 100 * '-')
 for i, sample_output in enumerate(sample_outputs):
 
     text = tokenizer.decode(sample_output, skip_special_tokens=True)
@@ -107,9 +103,7 @@ for i, sample_output in enumerate(sample_outputs):
     # Remove all text after 3 newlines
     text = text[: text.find(new_lines) if new_lines else None]
 
-print("\
-
-print("\
-" + 100 * '-')
+    print("\n{}: {}".format(i, text))
+    print("\n" + 100 * '-')
 
 ```
|