Install the required libraries
pip install transformers datasets torch
from datasets import load_dataset
Load the dataset (this example pulls the Hamses/EU_Regulation_261_2004 repository from the Hugging Face Hub; data_files selects which files in the repo back each split)
dataset = load_dataset('Hamses/EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})
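If your corpus is instead a set of local plain-text files, the generic 'text' loader produces the same split structure (a sketch, assuming train.txt and test.txt are line-per-example files in the working directory):
dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})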
Load the GPT-2 tokenizer
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 has no padding token by default; reuse the EOS token so padding works
tokenizer.pad_token = tokenizer.eos_token
Preprocess the dataset
def preprocess_function(examples):
    # Pads/truncates each example to the tokenizer's model_max_length (1024 for GPT-2)
    return tokenizer(examples['text'], padding='max_length', truncation=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)
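To sanity-check the preprocessing, you can decode one encoded example back to text (a quick sketch; input_ids is the column added by the tokenizer):
sample = encoded_dataset['train'][0]
print(tokenizer.decode(sample['input_ids'][:50]))  # first 50 tokens of the first training example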
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)
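By default the Trainer evaluates only when trainer.evaluate() is called explicitly. To also evaluate on the test split at the end of every epoch, add an evaluation strategy (a sketch; recent transformers releases name this parameter eval_strategy, older ones evaluation_strategy):
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # run evaluation after each training epoch
    # ... remaining arguments as above ...
)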
Create a data collator for causal language modeling and initialize the Trainer (without a collator that supplies labels, the Trainer cannot compute a loss for GPT-2)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # with mlm=False, labels are a copy of input_ids (padding masked out)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    data_collator=data_collator,
)
Train the model
trainer.train()
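Checkpoints are written under output_dir during training, so an interrupted run can be resumed via the Trainer's built-in resume_from_checkpoint argument:
trainer.train(resume_from_checkpoint=True)  # resumes from the latest checkpoint in output_dir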
Evaluate the model
results = trainer.evaluate()
print(results)
Save the model and tokenizer
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')
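Once saved, the fine-tuned model can be reloaded for inference. A minimal generation sketch (the prompt text is just an illustration):
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned')
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned')

prompt = 'In the event of flight cancellation, passengers are entitled to'  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.95, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))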