vakodiya committed on
Commit
bdf737a
1 Parent(s): 2032ac8

Change epochs to 1 and batch size to 8, and disable gradient checkpointing
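For context on this change: the previous configuration trained with a per-device batch size of 1 and gradient accumulation over 8 steps, while the new one feeds 8 examples per step with no accumulation, so the effective optimizer batch size stays at 8. What changes is memory behaviour: activations for 8 examples are now held at once, and gradient checkpointing is no longer there to trade compute for memory. A quick sanity check on the arithmetic (the 1 and 8 are the values from the diff below, not new parameters):

# effective batch = per_device_train_batch_size * gradient_accumulation_steps
old_effective = 1 * 8   # before: micro-batch 1, accumulate over 8 steps
new_effective = 8 * 1   # after: micro-batch 8, no accumulation (defaults to 1)
assert old_effective == new_effective == 8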

Files changed (1)
  1. app.py +54 -54
app.py CHANGED
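Both hunks start at line 7 of app.py, so the script's import block is not part of this diff. Judging from the names used below (st, load_dataset, AutoTokenizer, DataLoader, Trainer, and so on), it presumably looks roughly like the following reconstruction; treat it as an assumption, not as committed code:

import os
import traceback

import streamlit as st
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments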
@@ -7,7 +7,6 @@ import traceback

dir_path = os.path.abspath('./')
os.environ["HF_HOME"] = dir_path
-start_training = st.button("Train Model")


def tokenize_function(examples):
@@ -19,69 +18,70 @@ def tokenize_function(examples):
    return tokenized_inputs


-if start_training:
-    st.write("Getting model and dataset ...")
-    # Load the dataset
-    dataset = load_dataset("viber1/indian-law-dataset", cache_dir=dir_path)
+st.write("Getting model and dataset ...")
+# Load the dataset
+dataset = load_dataset("viber1/indian-law-dataset", cache_dir=dir_path)

-    # Update this path based on where the tokenizer files are actually stored
-    tokenizer = AutoTokenizer.from_pretrained('gpt2')
-    tokenizer.pad_token = tokenizer.eos_token
-    # Load the model
-    model = AutoModelForCausalLM.from_pretrained('gpt2')
-    model.gradient_checkpointing_enable()
+# Update this path based on where the tokenizer files are actually stored
+tokenizer = AutoTokenizer.from_pretrained('gpt2')
+tokenizer.pad_token = tokenizer.eos_token
+# Load the model
+model = AutoModelForCausalLM.from_pretrained('gpt2')
+# model.gradient_checkpointing_enable()

-    st.write("Training setup ...")
-    # Apply the tokenizer to the dataset
-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+st.write("Training setup ...")
+# Apply the tokenizer to the dataset
+tokenized_dataset = dataset.map(tokenize_function, batched=True)

-    # Split the dataset manually into train and validation sets
-    split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)
+# Split the dataset manually into train and validation sets
+split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

-    # Convert the dataset to PyTorch tensors
-    train_dataset = split_dataset["train"].with_format("torch")
-    eval_dataset = split_dataset["test"].with_format("torch")
+# Convert the dataset to PyTorch tensors
+train_dataset = split_dataset["train"].with_format("torch")
+eval_dataset = split_dataset["test"].with_format("torch")

-    # Create data loaders
-    # reduce batch size 8 to 1
-    train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, pin_memory=True)
-    eval_dataloader = DataLoader(eval_dataset, batch_size=1, pin_memory=True)
+# Create data loaders
+# reduce batch size 8 to 1
+train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True)
+eval_dataloader = DataLoader(eval_dataset, batch_size=8, pin_memory=True)

-    # Define training arguments
-    training_args = TrainingArguments(
-        output_dir="./results",
-        eval_strategy="epoch",
-        learning_rate=2e-5,
-        per_device_train_batch_size=1,
-        per_device_eval_batch_size=1,
-        num_train_epochs=3,
-        weight_decay=0.01,
-        fp16=True, # Enable mixed precision
-        # save_total_limit=2,
-        logging_dir='./logs', # Set logging directory
-        logging_steps=10, # Log more frequently
-        gradient_checkpointing=True, # Enable gradient checkpointing
-        gradient_accumulation_steps=8 # Accumulate gradients over 8
-    )
+# Define training arguments
+training_args = TrainingArguments(
+    output_dir="{dir_path}/results",
+    eval_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    num_train_epochs=1,
+    weight_decay=0.01,
+    fp16=True, # Enable mixed precision
+    # save_total_limit=2,
+    logging_dir='{dir_path}/logs', # Set logging directory
+    logging_steps=5, # Log more frequently
+    # gradient_checkpointing=True, # Enable gradient checkpointing
+    # gradient_accumulation_steps=8 # Accumulate gradients over 8
+)

-    st.write("Training Started .....")
+st.write("Training Started .....")

-    # Create the Trainer
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-    )
-
-    try:
-        trainer.train()
-    except Exception as e:
-        st.write(f"Error: {e}")
-        traceback.print_exc()
-        st.write("some error")
+# Create the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+)

+try:
+    trainer.train()
    # Evaluate the model
    st.write("Training Done ...")
+    model.save_pretrained(f"{dir_path}\\trained-gpt2")
+    tokenizer.save_pretrained(f"{dir_path}\\trained-gpt2")
+    st.write("Evaluating Model ...")
    results = trainer.evaluate()
    st.write(results)
+except Exception as e:
+    st.write(f"Error: {e}")
+    traceback.print_exc()
+    st.write("some error")
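One detail worth flagging in the new TrainingArguments: output_dir="{dir_path}/results" and logging_dir='{dir_path}/logs' are plain string literals, so "{dir_path}" is not interpolated and output goes to a relative folder literally named "{dir_path}", unlike the save_pretrained calls, which do use f-strings. If interpolation was the intent, a minimal sketch of the fix (not part of this commit, and assuming dir_path from the script) would be:

training_args = TrainingArguments(
    output_dir=f"{dir_path}/results",   # f-string, so dir_path is actually substituted
    logging_dir=f"{dir_path}/logs",
    # remaining arguments unchanged from the diff above
)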
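The body of tokenize_function sits outside both hunks; only its return tokenized_inputs line appears as context. For orientation, a typical shape for such a function in a causal-LM fine-tune is sketched below; the "Instruction" and "Response" column names are guesses about the viber1/indian-law-dataset schema and are not taken from this commit:

def tokenize_function(examples):
    # Hypothetical reconstruction: join the assumed prompt/answer columns into one
    # text per example, tokenize with padding and truncation, and reuse the
    # input_ids as labels so the Trainer can compute a causal-LM loss.
    texts = [f"{q}\n{a}" for q, a in zip(examples["Instruction"], examples["Response"])]
    tokenized_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs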