import os

# Set the Hugging Face cache location before importing datasets/transformers;
# these libraries resolve HF_HOME when they are imported, so setting it after
# the imports would have no effect on where models and datasets are cached.
dir_path = os.path.abspath('./')
os.environ["HF_HOME"] = dir_path

import traceback

import streamlit as st
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer


def tokenize_function(examples):
    # Concatenate Instruction and Response into a single training text per example
    combined_texts = [instr + " " + resp for instr, resp in zip(examples["Instruction"], examples["Response"])]
    tokenized_inputs = tokenizer(combined_texts, padding="max_length", truncation=True, max_length=512)
    # For causal LM fine-tuning the labels are the input ids themselves
    # (padding positions are not masked here, so the loss is also computed on padding tokens)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


st.write("Getting model and dataset ...")
# Load the dataset
dataset = load_dataset("viber1/indian-law-dataset", cache_dir=dir_path)

# Load the GPT-2 tokenizer; GPT-2 has no pad token, so reuse the EOS token for padding
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# Load the model
model = AutoModelForCausalLM.from_pretrained('gpt2')
# model.gradient_checkpointing_enable()

st.write("Training setup ...")
# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the dataset manually into train and validation sets
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

# Convert the dataset to PyTorch tensors
train_dataset = split_dataset["train"].with_format("torch")
eval_dataset = split_dataset["test"].with_format("torch")

# Create data loaders (note: these are not used below; the Trainer builds its own
# data loaders from the datasets and the batch sizes in TrainingArguments)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, pin_memory=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"{dir_path}/results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision (requires a CUDA GPU)
    # save_total_limit=2,
    logging_dir=f"{dir_path}/logs",  # Set logging directory
    logging_steps=5,  # Log more frequently
    # gradient_checkpointing=True,  # Enable gradient checkpointing
    # gradient_accumulation_steps=8  # Accumulate gradients over 8 steps
)

st.write("Training Started .....")

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

try:
    trainer.train()
    st.write("Training Done ...")
    # Save the fine-tuned model and tokenizer
    model.save_pretrained(f"{dir_path}/trained-gpt2")
    tokenizer.save_pretrained(f"{dir_path}/trained-gpt2")
    # Evaluate the model
    st.write("Evaluating Model ...")
    results = trainer.evaluate()
    st.write(results)
except Exception as e:
    st.write(f"Error: {e}")
    traceback.print_exc()