fixed smaller pretrain dataset
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -20,7 +20,7 @@ def batch_iterator(name=None):
|
|
20 |
for d in dataset:
|
21 |
for row in d:
|
22 |
for n in row:
|
23 |
-
yield row['instruction'] + '
|
24 |
|
25 |
del dataset
|
26 |
gc.collect()
|
@@ -79,7 +79,7 @@ def batch_iterator(name=None):
|
|
79 |
dataset = load_dataset(name, split='train')
|
80 |
|
81 |
for row in dataset:
|
82 |
-
yield row['prompt'] + '
|
83 |
|
84 |
del dataset
|
85 |
gc.collect()
|
@@ -120,7 +120,7 @@ def batch_iterator(name=None):
|
|
120 |
dataset = load_dataset(name, split='train')
|
121 |
|
122 |
for row in dataset:
|
123 |
-
yield row['query'] + '
|
124 |
|
125 |
del dataset
|
126 |
gc.collect()
|
@@ -130,7 +130,7 @@ def batch_iterator(name=None):
|
|
130 |
dataset = load_dataset(name, split='train')
|
131 |
|
132 |
for row in dataset:
|
133 |
-
yield row['instruction'] + '
|
134 |
|
135 |
del dataset
|
136 |
gc.collect()
|
@@ -140,7 +140,7 @@ def batch_iterator(name=None):
|
|
140 |
dataset = load_dataset(name, split='train')
|
141 |
|
142 |
for row in dataset:
|
143 |
-
yield row['instruction'] + '
|
144 |
|
145 |
del dataset
|
146 |
gc.collect()
|
@@ -150,7 +150,7 @@ def batch_iterator(name=None):
|
|
150 |
dataset = load_dataset(name, split='train')
|
151 |
|
152 |
for row in dataset:
|
153 |
-
yield row['prompt'] + '
|
154 |
|
155 |
del dataset
|
156 |
gc.collect()
|
@@ -171,7 +171,7 @@ def batch_iterator(name=None):
|
|
171 |
dataset = load_dataset(name, split=split)
|
172 |
|
173 |
for row in dataset:
|
174 |
-
yield row['question'] + '
|
175 |
|
176 |
del dataset
|
177 |
gc.collect()
|
@@ -181,7 +181,7 @@ def batch_iterator(name=None):
|
|
181 |
dataset = load_dataset(name, split='train')
|
182 |
|
183 |
for row in dataset:
|
184 |
-
yield row['instruction'] + '
|
185 |
|
186 |
del dataset
|
187 |
gc.collect()
|
@@ -191,7 +191,7 @@ def batch_iterator(name=None):
|
|
191 |
dataset = load_dataset(name, split='train')
|
192 |
|
193 |
for row in dataset:
|
194 |
-
yield row['question'] + '
|
195 |
|
196 |
del dataset
|
197 |
gc.collect()
|
@@ -201,7 +201,7 @@ def batch_iterator(name=None):
|
|
201 |
dataset = load_dataset(name, split='train')
|
202 |
|
203 |
for row in dataset:
|
204 |
-
yield row['question_translated_srb'] + '
|
205 |
|
206 |
del dataset
|
207 |
gc.collect()
|
@@ -211,7 +211,13 @@ def batch_iterator(name=None):
|
|
211 |
dataset = load_dataset(name, split='train')
|
212 |
|
213 |
for row in dataset:
|
214 |
-
yield
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
|
216 |
del dataset
|
217 |
gc.collect()
|
|
|
20 |
for d in dataset:
|
21 |
for row in d:
|
22 |
for n in row:
|
23 |
+
yield row['instruction'] + ' ' + row['input'] + ' ' + row['output']
|
24 |
|
25 |
del dataset
|
26 |
gc.collect()
|
|
|
79 |
dataset = load_dataset(name, split='train')
|
80 |
|
81 |
for row in dataset:
|
82 |
+
yield row['prompt'] + ' ' + row['response']
|
83 |
|
84 |
del dataset
|
85 |
gc.collect()
|
|
|
120 |
dataset = load_dataset(name, split='train')
|
121 |
|
122 |
for row in dataset:
|
123 |
+
yield row['query'] + ' ' + row['answer']
|
124 |
|
125 |
del dataset
|
126 |
gc.collect()
|
|
|
130 |
dataset = load_dataset(name, split='train')
|
131 |
|
132 |
for row in dataset:
|
133 |
+
yield row['instruction'] + ' ' + row['output']
|
134 |
|
135 |
del dataset
|
136 |
gc.collect()
|
|
|
140 |
dataset = load_dataset(name, split='train')
|
141 |
|
142 |
for row in dataset:
|
143 |
+
yield row['instruction'] + ' ' + row['input'] + ' ' + row['output']
|
144 |
|
145 |
del dataset
|
146 |
gc.collect()
|
|
|
150 |
dataset = load_dataset(name, split='train')
|
151 |
|
152 |
for row in dataset:
|
153 |
+
yield row['prompt'] + ' ' + row['completion']
|
154 |
|
155 |
del dataset
|
156 |
gc.collect()
|
|
|
171 |
dataset = load_dataset(name, split=split)
|
172 |
|
173 |
for row in dataset:
|
174 |
+
yield row['question'] + ' ' + row['answer']
|
175 |
|
176 |
del dataset
|
177 |
gc.collect()
|
|
|
181 |
dataset = load_dataset(name, split='train')
|
182 |
|
183 |
for row in dataset:
|
184 |
+
yield row['instruction'] + ' ' + row['output']
|
185 |
|
186 |
del dataset
|
187 |
gc.collect()
|
|
|
191 |
dataset = load_dataset(name, split='train')
|
192 |
|
193 |
for row in dataset:
|
194 |
+
yield row['question'] + ' ' + row['answer']
|
195 |
|
196 |
del dataset
|
197 |
gc.collect()
|
|
|
201 |
dataset = load_dataset(name, split='train')
|
202 |
|
203 |
for row in dataset:
|
204 |
+
yield row['question_translated_srb'] + ' ' + row['answer_translated_srb']
|
205 |
|
206 |
del dataset
|
207 |
gc.collect()
|
|
|
211 |
dataset = load_dataset(name, split='train')
|
212 |
|
213 |
for row in dataset:
|
214 |
+
yield (
|
215 |
+
row['character'] + ' ' +
|
216 |
+
row['unicode'] + ' ' +
|
217 |
+
row['short description'] + ' ' +
|
218 |
+
row['tags'] + ' ' +
|
219 |
+
row['LLM description']
|
220 |
+
)
|
221 |
|
222 |
del dataset
|
223 |
gc.collect()
|