mtasic85 committed on
Commit
80f3ec1
1 Parent(s): 854297e

fixed smaller pretrain dataset

Browse files
Files changed (1) hide show
  1. scripts/prepare_pretrain_dataset.py +17 -11
scripts/prepare_pretrain_dataset.py CHANGED
@@ -20,7 +20,7 @@ def batch_iterator(name=None):
20
  for d in dataset:
21
  for row in d:
22
  for n in row:
23
- yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
24
 
25
  del dataset
26
  gc.collect()
@@ -79,7 +79,7 @@ def batch_iterator(name=None):
79
  dataset = load_dataset(name, split='train')
80
 
81
  for row in dataset:
82
- yield row['prompt'] + '\n' + row['response']
83
 
84
  del dataset
85
  gc.collect()
@@ -120,7 +120,7 @@ def batch_iterator(name=None):
120
  dataset = load_dataset(name, split='train')
121
 
122
  for row in dataset:
123
- yield row['query'] + '\n' + row['answer']
124
 
125
  del dataset
126
  gc.collect()
@@ -130,7 +130,7 @@ def batch_iterator(name=None):
130
  dataset = load_dataset(name, split='train')
131
 
132
  for row in dataset:
133
- yield row['instruction'] + '\n' + row['output']
134
 
135
  del dataset
136
  gc.collect()
@@ -140,7 +140,7 @@ def batch_iterator(name=None):
140
  dataset = load_dataset(name, split='train')
141
 
142
  for row in dataset:
143
- yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
144
 
145
  del dataset
146
  gc.collect()
@@ -150,7 +150,7 @@ def batch_iterator(name=None):
150
  dataset = load_dataset(name, split='train')
151
 
152
  for row in dataset:
153
- yield row['prompt'] + '\n' + row['completion']
154
 
155
  del dataset
156
  gc.collect()
@@ -171,7 +171,7 @@ def batch_iterator(name=None):
171
  dataset = load_dataset(name, split=split)
172
 
173
  for row in dataset:
174
- yield row['question'] + '\n' + row['answer']
175
 
176
  del dataset
177
  gc.collect()
@@ -181,7 +181,7 @@ def batch_iterator(name=None):
181
  dataset = load_dataset(name, split='train')
182
 
183
  for row in dataset:
184
- yield row['instruction'] + '\n' + row['output']
185
 
186
  del dataset
187
  gc.collect()
@@ -191,7 +191,7 @@ def batch_iterator(name=None):
191
  dataset = load_dataset(name, split='train')
192
 
193
  for row in dataset:
194
- yield row['question'] + '\n' + row['answer']
195
 
196
  del dataset
197
  gc.collect()
@@ -201,7 +201,7 @@ def batch_iterator(name=None):
201
  dataset = load_dataset(name, split='train')
202
 
203
  for row in dataset:
204
- yield row['question_translated_srb'] + '\n' + row['answer_translated_srb']
205
 
206
  del dataset
207
  gc.collect()
@@ -211,7 +211,13 @@ def batch_iterator(name=None):
211
  dataset = load_dataset(name, split='train')
212
 
213
  for row in dataset:
214
- yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
 
 
 
 
 
 
215
 
216
  del dataset
217
  gc.collect()
 
20
  for d in dataset:
21
  for row in d:
22
  for n in row:
23
+ yield row['instruction'] + ' ' + row['input'] + ' ' + row['output']
24
 
25
  del dataset
26
  gc.collect()
 
79
  dataset = load_dataset(name, split='train')
80
 
81
  for row in dataset:
82
+ yield row['prompt'] + ' ' + row['response']
83
 
84
  del dataset
85
  gc.collect()
 
120
  dataset = load_dataset(name, split='train')
121
 
122
  for row in dataset:
123
+ yield row['query'] + ' ' + row['answer']
124
 
125
  del dataset
126
  gc.collect()
 
130
  dataset = load_dataset(name, split='train')
131
 
132
  for row in dataset:
133
+ yield row['instruction'] + ' ' + row['output']
134
 
135
  del dataset
136
  gc.collect()
 
140
  dataset = load_dataset(name, split='train')
141
 
142
  for row in dataset:
143
+ yield row['instruction'] + ' ' + row['input'] + ' ' + row['output']
144
 
145
  del dataset
146
  gc.collect()
 
150
  dataset = load_dataset(name, split='train')
151
 
152
  for row in dataset:
153
+ yield row['prompt'] + ' ' + row['completion']
154
 
155
  del dataset
156
  gc.collect()
 
171
  dataset = load_dataset(name, split=split)
172
 
173
  for row in dataset:
174
+ yield row['question'] + ' ' + row['answer']
175
 
176
  del dataset
177
  gc.collect()
 
181
  dataset = load_dataset(name, split='train')
182
 
183
  for row in dataset:
184
+ yield row['instruction'] + ' ' + row['output']
185
 
186
  del dataset
187
  gc.collect()
 
191
  dataset = load_dataset(name, split='train')
192
 
193
  for row in dataset:
194
+ yield row['question'] + ' ' + row['answer']
195
 
196
  del dataset
197
  gc.collect()
 
201
  dataset = load_dataset(name, split='train')
202
 
203
  for row in dataset:
204
+ yield row['question_translated_srb'] + ' ' + row['answer_translated_srb']
205
 
206
  del dataset
207
  gc.collect()
 
211
  dataset = load_dataset(name, split='train')
212
 
213
  for row in dataset:
214
+ yield (
215
+ row['character'] + ' ' +
216
+ row['unicode'] + ' ' +
217
+ row['short description'] + ' ' +
218
+ row['tags'] + ' ' +
219
+ row['LLM description']
220
+ )
221
 
222
  del dataset
223
  gc.collect()