from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# Load dataset - CodeParrot is a good example dataset
dataset = load_dataset('codeparrot/code-to-text')

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# GPT-2 has no pad token by default; reuse the EOS token so padding works
tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples['code'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['code'],
)

# Data collator for causal language modeling (mlm=False); it builds the
# labels the Trainer needs to compute the next-token loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id='dnnsdunca/UANN',
    hub_token='YOUR_HUGGINGFACE_TOKEN',  # replace with your own token; never commit a real token
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save the model
model.save_pretrained('./codegen_model')
tokenizer.save_pretrained('./codegen_model')
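
# --- Optional inference sketch (not part of the original script) ---
# A minimal example of loading the saved checkpoint and sampling a completion.
# The prompt string and generation parameters below are illustrative assumptions.
gen_model = GPT2LMHeadModel.from_pretrained('./codegen_model')
gen_tokenizer = GPT2Tokenizer.from_pretrained('./codegen_model')

prompt = "def fibonacci(n):"  # hypothetical prompt for demonstration
inputs = gen_tokenizer(prompt, return_tensors='pt')
outputs = gen_model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    top_p=0.95,
    pad_token_id=gen_tokenizer.eos_token_id,
)
print(gen_tokenizer.decode(outputs[0], skip_special_tokens=True))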