---
license: cc0-1.0
datasets:
- Hamses/EU_Regulation_261_2004
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- legal
---

Install the dependencies:

```bash
pip install transformers datasets torch
```

Fine-tune GPT-2 on the dataset:

```python
from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Load the dataset (ensure the text files are in the proper format)
dataset = load_dataset('Hamses/EU_Regulation_261_2004',
                       data_files={'train': 'train.txt', 'test': 'test.txt'})

# Load the GPT-2 tokenizer; GPT-2 has no pad token, so reuse the EOS token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# The collator builds the labels needed for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

# Save the model and tokenizer
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')
```
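
Once training finishes, the saved checkpoint can be loaded back for generation. A minimal sketch follows; the prompt text and sampling parameters are illustrative assumptions, not part of the training recipe above:

```python
from transformers import pipeline

# Load the fine-tuned model and tokenizer from the output directory
generator = pipeline('text-generation',
                     model='./gpt2-finetuned',
                     tokenizer='./gpt2-finetuned')

# Illustrative prompt; tune max_new_tokens and sampling settings as needed
prompt = 'Under Regulation (EC) No 261/2004, passengers are entitled to'
output = generator(prompt, max_new_tokens=60, do_sample=True, top_p=0.9)
print(output[0]['generated_text'])
```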