Build error
uh oh
#4 by nroggendorff - opened
train.py CHANGED
@@ -1,18 +1,14 @@
-import os
-from sys import exit
-import torch
 import trl
 from transformers import (
-    AutoTokenizer, LlamaConfig,
-    PreTrainedTokenizerFast
+    AutoTokenizer, LlamaConfig, LlamaForCausalLM,
+    PreTrainedTokenizerFast
 )
 from trl import SFTConfig, SFTTrainer
 from datasets import load_dataset, Dataset
 from tokenizers import ByteLevelBPETokenizer
 from huggingface_hub import HfApi
-from torch.utils.data import DataLoader
 from itertools import islice
-
+
 from logging import getLogger, StreamHandler, INFO
 
 logger = getLogger(__name__)
@@ -132,7 +128,7 @@ def format_prompts(examples, tokenizer, is_instructional):
     return {'text': tokenizer.code(texts)}
 
 def create_model(tokenizer):
-
+    model_config = LlamaConfig(
         vocab_size=tokenizer.vocab_size,
         hidden_size=config.FACTOR,
         intermediate_size=config.FACTOR * 4,
@@ -147,10 +143,9 @@ def create_model(tokenizer):
         eos_token_id=tokenizer.eos_token_id,
         tie_word_embeddings=False,
     )
-    return LlamaForCausalLM(
+    return LlamaForCausalLM(model_config)
 
 def train_model(model, tokenizer, dataset, push_to_hub, is_instructional):
-    config =
     dataset = dataset.map(
         lambda examples: format_prompts(examples, tokenizer, is_instructional),
         batched=True,
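
The second and third hunks contain the actual fix: the old file was missing the opener of the LlamaConfig(...) call (leaving its keyword arguments orphaned), left return LlamaForCausalLM( unclosed, and had a dangling config = in train_model, which is consistent with the syntax error behind the build failure; the first hunk adds the previously missing LlamaForCausalLM import and drops unused ones. For reference, a minimal runnable sketch of create_model as it reads after the patch. Renaming to model_config also keeps the instance from shadowing whatever module-level config supplies FACTOR. The FACTOR value, the layer/head counts, and the GPT-2 tokenizer below are illustrative assumptions; only the kwargs visible in the diff come from the original.

# Minimal sketch of the patched create_model. FACTOR, the layer/head
# counts, and the tokenizer choice are assumptions for illustration.
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

FACTOR = 128  # assumed stand-in for config.FACTOR in train.py

def create_model(tokenizer):
    # Named model_config so it cannot collide with the module-level
    # config object that provides FACTOR.
    model_config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=FACTOR,
        intermediate_size=FACTOR * 4,
        num_hidden_layers=4,      # assumed; elided from the diff
        num_attention_heads=4,    # assumed; elided from the diff
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        tie_word_embeddings=False,
    )
    return LlamaForCausalLM(model_config)

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the sketch
model = create_model(tokenizer)
print(f"{model.num_parameters():,} parameters")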
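
The train_model hunk drops the dangling config = line; what remains is the dataset.map formatting pass that prepares the data for SFTTrainer. Below is a hedged sketch of that step in isolation: format_prompts here is a simplified stand-in (the real one returns {'text': tokenizer.code(texts)}), and the 'raw' column is invented for the demo.

# Isolated sketch of the batched formatting step from train_model.
# format_prompts is a stand-in; the 'raw' column name is invented.
from datasets import Dataset

def format_prompts(examples, tokenizer, is_instructional):
    # With batched=True, examples is a dict of column -> list of values;
    # returning a dict adds/overwrites the 'text' column.
    return {"text": [t.strip() for t in examples["raw"]]}

dataset = Dataset.from_dict({"raw": [" hello world ", " uh oh "]})
dataset = dataset.map(
    lambda examples: format_prompts(examples, tokenizer=None, is_instructional=False),
    batched=True,
)
print(dataset["text"])  # ['hello world', 'uh oh']

The SFTConfig and SFTTrainer imports suggest the mapped dataset is then handed to SFTTrainer, but trl's constructor arguments differ across versions, so they are not reproduced here.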