Failed to run on multiple devices.
#4 opened by hjewr
Hi, I am trying to run DeBERTa for sequence classification.
Here is my code:
import logging
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    DebertaV2Config,
    DebertaV2ForSequenceClassification,
    DebertaV2Tokenizer,
)

# ConfigClass, setup_loging, ProcessData and CustomDataset are my own helpers.

if __name__ == "__main__":
    cfg = ConfigClass()
    log_path = cfg.log_path
    epochs = cfg.epochs
    max_length = cfg.max_length
    batch_size = cfg.batch_size
    model_output = cfg.model_output
    learning_rate = cfg.learning_rate

    # Make only the first two GPUs visible to this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    setup_loging(log_path)

    file_path = ["data/reddit/train/train1.txt", "data/reddit/test/test1.txt"]
    texts, labels = [], []
    for path in file_path:
        text, label = ProcessData(path)
        texts.extend(text)
        labels.extend(label)

    model_name = "microsoft/deberta-v2-xlarge"
    config = DebertaV2Config.from_pretrained(model_name, num_labels=5)
    tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
    model = DebertaV2ForSequenceClassification.from_pretrained(model_name, num_labels=5)

    dataset = CustomDataset(texts, labels, tokenizer, max_length)
    train, test = torch.utils.data.random_split(
        dataset, [int(0.8 * len(dataset)), len(dataset) - int(0.8 * len(dataset))]
    )
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)
    logging.info(f"Train size: {len(train_loader.dataset)}")

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        logging.info(f"Let's use {torch.cuda.device_count()} GPUs!")
        # Replicate the model across the visible GPUs on every forward pass.
        model = torch.nn.DataParallel(model)
    else:
        logging.info("Let's use 1 GPU!")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            print(input_ids.shape, attention_mask.shape, labels.shape)
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )
            # DataParallel gathers one loss per replica; reduce to a scalar
            # (this is a no-op on a single GPU).
            loss = outputs.loss.mean()
            logits = outputs.logits
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        logging.info(
            f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}"
        )

        model.eval()
        correct = 0
        total = 0
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted = torch.argmax(logits, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print(f"Accuracy: {correct / total:.4f}")

    # Unwrap DataParallel before saving, since the wrapper has no save_pretrained.
    model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
    model_to_save.save_pretrained(model_output)
and I got an error like this:
File "main.py", line 104, in <module>
outputs = model(
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 184, in forward
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 189, in replicate
return replicate(module, device_ids, not torch.is_grad_enabled())
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/replicate.py", line 110, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/replicate.py", line 83, in _broadcast_coalesced_reshape
tensor_copies = Broadcast.apply(devices, *tensors)
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/autograd/function.py", line 553, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 23, in forward
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 57, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details)
I have tried it with a single GPU and it works.
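The error points at NCCL rather than at the model itself, so for reference here is a minimal check that should exercise the same replicate/broadcast path that fails in the traceback, without loading DeBERTa. This is just a sketch: it assumes GPUs 0 and 1 are the visible devices, and the tiny Linear layer and the file name in the comment are only placeholders.

# Minimal sanity check for the multi-GPU path, independent of DeBERTa.
import os

# NCCL reads this at init time; setting it on the command line
# (e.g. NCCL_DEBUG=INFO python check_nccl.py) works as well.
os.environ["NCCL_DEBUG"] = "INFO"

import torch

if torch.cuda.device_count() < 2:
    raise SystemExit("need at least two visible GPUs for this check")

model = torch.nn.Linear(8, 2).to("cuda:0")
model = torch.nn.DataParallel(model, device_ids=[0, 1])

# This forward pass triggers the same replicate/broadcast_coalesced call
# that fails in the traceback above.
out = model(torch.randn(4, 8, device="cuda:0"))
print("DataParallel forward OK:", out.shape)

If this small script fails with the same NCCL error, the problem is in my CUDA/NCCL setup rather than in the training code above.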