ShynBui committed on
Commit
4f12561
1 Parent(s): 6c98277

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -91
app.py CHANGED
@@ -1,91 +1,93 @@
1
- import time
2
- import torch
3
- from transformers import BertForSequenceClassification, AdamW
4
- from torch.utils.data import DataLoader, TensorDataset
5
- from transformers import BertTokenizer
6
- import gradio as gr
7
- import pandas as pd
8
- import os
9
- import spaces
10
-
11
# Run on the GPU when CUDA is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Tokenizer and classifier are both loaded from 'bert-base-uncased'; the model
# is moved to the selected device right after loading.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

# One optimizer instance shared by every training call in this module.
optimizer = AdamW(params=model.parameters(), lr=1e-5)

# Dataset assigned by load_data(); stays None until a CSV has been loaded.
global_data = None
21
-
22
def load_data(file):
    """Read a labelled CSV and cache it as a tokenized ``TensorDataset``.

    The CSV needs a ``text`` column and a label column. The label column is
    read from ``lable`` (the spelling this app has always used); when that
    column is absent but ``label`` exists, ``label`` is used instead.

    Args:
        file: Anything ``pandas.read_csv`` accepts (a path or file-like object).

    Raises:
        KeyError: If the ``text`` column or the label column is missing.

    Side effects:
        Rebinds the module-level ``global_data`` and prints it.
    """
    global global_data
    df = pd.read_csv(file)

    # Prefer the historical 'lable' header so existing CSVs behave exactly as
    # before; only fall back to the correct spelling when 'lable' is missing.
    if 'lable' not in df.columns and 'label' in df.columns:
        label_column = 'label'
    else:
        label_column = 'lable'

    # Encode every text in one call, padded/truncated into rectangular tensors.
    inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")
    labels = torch.tensor(df[label_column].tolist()).long()
    global_data = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

    print(global_data)
30
-
31
def get_dataloader(start, end, batch_size=8):
    """Build a ``DataLoader`` over samples ``start`` .. ``end - 1`` of ``global_data``.

    Args:
        start: Index of the first sample in the slice (inclusive).
        end: Index one past the last sample in the slice (exclusive).
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` wrapping a ``Subset`` of the module-level dataset.
    """
    window = range(start, end)
    chunk = torch.utils.data.Subset(global_data, window)
    loader = DataLoader(chunk, batch_size=batch_size)
    return loader
35
-
36
@spaces.GPU(duration=120)
def train_batch(dataloader):
    """Train the shared model on the batches of ``dataloader`` within a time budget.

    Each batch is an ``(input_ids, attention_mask, labels)`` triple that is
    moved to ``device`` before the forward/backward pass. After every
    optimizer step the elapsed wall-clock time is checked; once it exceeds
    the budget the weights are checkpointed and the function returns early,
    leaving the remaining batches untrained.

    Args:
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``.

    Returns:
        ``(True, message)`` when every batch was processed, or
        ``(False, message)`` when the time budget ran out first.
    """
    # Stop and checkpoint after 60 s, well inside the duration=120 requested
    # from spaces.GPU above.
    time_budget_s = 60
    checkpoint_path = "./checkpoint/model.pt"

    model.train()
    start_time = time.time()

    for input_ids, attention_mask, labels in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if time.time() - start_time > time_budget_s:
            # torch.save does not create missing parent directories.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
            return False, "Checkpoint saved. Training paused."

    return True, "Batch training completed."
57
-
58
-
59
def train_step(file=None):
    """Gradio handler: optionally load a CSV, then train over it chunk by chunk.

    The dataset is walked in chunks of ``batch_size * 10`` samples. Each chunk
    is handed to ``train_batch``. Training stops early, with a checkpoint on
    disk, as soon as ``train_batch`` reports that it was cut short or a
    single chunk takes too long.

    Args:
        file: Optional CSV to load via ``load_data``. When omitted, the
            dataset loaded by an earlier call is reused.

    Returns:
        A human-readable status string for the UI.
    """
    if file:
        load_data(file)

    if global_data is None:
        # Nothing uploaded now or earlier; len(None) would raise TypeError.
        return "No data loaded. Please upload a CSV file first."

    batch_size = 8
    chunk_size = batch_size * 10  # small chunks keep each GPU call short
    chunk_time_limit_s = 100
    checkpoint_path = "./checkpoint/model.pt"
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    total_samples = len(global_data)
    # NOTE(review): a paused run is not resumed; every call starts at index 0.
    start_idx = 0

    while start_idx < total_samples:
        end_idx = min(start_idx + chunk_size, total_samples)
        dataloader = get_dataloader(start_idx, end_idx, batch_size)

        start_time = time.time()
        success, message = train_batch(dataloader)
        elapsed_time = time.time() - start_time

        # Stop when train_batch bailed out part-way through the chunk (moving
        # on would silently skip its remaining samples) or the chunk was slow.
        if not success or elapsed_time >= chunk_time_limit_s:
            torch.save(model.state_dict(), checkpoint_path)
            return f"{message}. Training paused after {elapsed_time:.2f}s."

        start_idx = end_idx

    torch.save(model.state_dict(), checkpoint_path)
    return "Training completed and model saved."
83
-
84
-
85
if __name__ == "__main__":
    # Single-input Gradio app: the uploaded CSV is passed to train_step and
    # the status string it returns is shown as text.
    iface = gr.Interface(fn=train_step, inputs=gr.File(label="Upload CSV"), outputs="text")
    iface.launch()
 
 
 
1
+ import time
2
+ import torch
3
+ from transformers import BertForSequenceClassification, AdamW
4
+ from torch.utils.data import DataLoader, TensorDataset
5
+ from transformers import BertTokenizer
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import os
9
+ import spaces
10
+
11
# Pick the GPU when CUDA is visible to torch, otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Tokenizer and classifier are both loaded from 'bert-base-uncased'; the model
# is moved to the selected device right after loading.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

# One optimizer instance shared by every training call in this module.
optimizer = AdamW(params=model.parameters(), lr=1e-5)

# Dataset assigned by load_data(); stays None until a CSV has been loaded.
global_data = None
21
+
22
def load_data(file):
    """Read a labelled CSV and cache it as a tokenized ``TensorDataset``.

    The CSV needs a ``text`` column and a label column. The label column is
    read from ``lable`` (the spelling this app has always used); when that
    column is absent but ``label`` exists, ``label`` is used instead.

    Args:
        file: Anything ``pandas.read_csv`` accepts (a path or file-like object).

    Raises:
        KeyError: If the ``text`` column or the label column is missing.

    Side effects:
        Rebinds the module-level ``global_data`` and prints it.
    """
    global global_data
    df = pd.read_csv(file)

    # Prefer the historical 'lable' header so existing CSVs behave exactly as
    # before; only fall back to the correct spelling when 'lable' is missing.
    if 'lable' not in df.columns and 'label' in df.columns:
        label_column = 'label'
    else:
        label_column = 'lable'

    # Encode every text in one call, padded/truncated into rectangular tensors.
    inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")
    labels = torch.tensor(df[label_column].tolist()).long()
    global_data = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

    print(global_data)
30
+
31
def get_dataloader(start, end, batch_size=8):
    """Build a ``DataLoader`` over samples ``start`` .. ``end - 1`` of ``global_data``.

    Args:
        start: Index of the first sample in the slice (inclusive).
        end: Index one past the last sample in the slice (exclusive).
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` wrapping a ``Subset`` of the module-level dataset.
    """
    window = range(start, end)
    chunk = torch.utils.data.Subset(global_data, window)
    loader = DataLoader(chunk, batch_size=batch_size)
    return loader
35
+
36
@spaces.GPU(duration=120)
def train_batch(dataloader):
    """Train the shared model on the batches of ``dataloader`` within a time budget.

    Each batch is an ``(input_ids, attention_mask, labels)`` triple that is
    moved to ``device`` before the forward/backward pass. After every
    optimizer step the elapsed wall-clock time is checked; once it exceeds
    the budget the weights are checkpointed and the function returns early,
    leaving the remaining batches untrained.

    Args:
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``.

    Returns:
        ``(True, message)`` when every batch was processed, or
        ``(False, message)`` when the time budget ran out first.
    """
    # Stop and checkpoint after 10 s, far inside the duration=120 requested
    # from spaces.GPU above.
    time_budget_s = 10
    checkpoint_path = "./checkpoint/model.pt"

    model.train()
    start_time = time.time()

    for input_ids, attention_mask, labels in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if time.time() - start_time > time_budget_s:
            print("save checkpoint")
            # torch.save does not create missing parent directories.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
            return False, "Checkpoint saved. Training paused."

    return True, "Batch training completed."
58
+
59
+
60
def train_step(file=None):
    """Gradio handler: optionally load a CSV, then train over it chunk by chunk.

    The dataset is walked in chunks of ``batch_size * 10`` samples. Each chunk
    is handed to ``train_batch``. Training stops early, with a checkpoint on
    disk, as soon as ``train_batch`` reports that it was cut short or a
    single chunk takes too long.

    Args:
        file: Optional CSV to load via ``load_data``. When omitted, the
            dataset loaded by an earlier call is reused.

    Returns:
        A human-readable status string for the UI.
    """
    if file:
        load_data(file)

    if global_data is None:
        # Nothing uploaded now or earlier; len(None) would raise TypeError.
        return "No data loaded. Please upload a CSV file first."

    batch_size = 8
    chunk_size = batch_size * 10  # small chunks keep each GPU call short
    chunk_time_limit_s = 10
    checkpoint_path = "./checkpoint/model.pt"
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    total_samples = len(global_data)
    # NOTE(review): a paused run is not resumed; every call starts at index 0.
    start_idx = 0

    while start_idx < total_samples:
        print(start_idx)
        end_idx = min(start_idx + chunk_size, total_samples)
        dataloader = get_dataloader(start_idx, end_idx, batch_size)

        start_time = time.time()
        success, message = train_batch(dataloader)
        elapsed_time = time.time() - start_time

        # Stop when train_batch bailed out part-way through the chunk (moving
        # on would silently skip its remaining samples) or the chunk was slow.
        if not success or elapsed_time >= chunk_time_limit_s:
            torch.save(model.state_dict(), checkpoint_path)
            return f"{message}. Training paused after {elapsed_time:.2f}s."

        start_idx = end_idx

    torch.save(model.state_dict(), checkpoint_path)
    return "Training completed and model saved."
85
+
86
+
87
if __name__ == "__main__":
    # Single-input Gradio app: the uploaded CSV is passed to train_step and
    # the status string it returns is shown as text.
    iface = gr.Interface(fn=train_step, inputs=gr.File(label="Upload CSV"), outputs="text")
    iface.launch()