Failed to run on multiple devices.

#4
by hjewr - opened

Hi, I am trying to run DeBERTa for sequence classification.
Here is my code:

if __name__ == "__main__":
    # Fine-tune DeBERTa-v2-xlarge for 5-class sequence classification on the
    # reddit data, optionally across multiple GPUs via DataParallel.
    cfg = ConfigClass()
    log_path = cfg.log_path
    epochs = cfg.epochs
    max_length = cfg.max_length
    batch_size = cfg.batch_size
    model_output = cfg.model_output
    learning_rate = cfg.learning_rate

    # NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set
    # before the first CUDA call. If torch initializes CUDA earlier than
    # this line, the setting is silently ignored — prefer exporting it in
    # the shell, or setting it at the very top of the file before
    # `import torch`.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    setup_loging(log_path)

    # Load and concatenate the raw train/test files into one corpus.
    file_path = ["data/reddit/train/train1.txt", "data/reddit/test/test1.txt"]
    texts, labels = [], []
    for path in file_path:
        text, label = ProcessData(path)
        texts.extend(text)
        labels.extend(label)

    model_name = "microsoft/deberta-v2-xlarge"
    tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
    # Build the config once and hand it to the model so the two cannot
    # drift apart (the original created a config with num_labels=5 but
    # never used it).
    config = DebertaV2Config.from_pretrained(model_name, num_labels=5)
    model = DebertaV2ForSequenceClassification.from_pretrained(
        model_name, config=config
    )

    # 80/20 random split of the combined dataset.
    dataset = CustomDataset(texts, labels, tokenizer, max_length)
    train_size = int(0.8 * len(dataset))
    train, test = torch.utils.data.random_split(
        dataset, [train_size, len(dataset) - train_size]
    )
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)
    logging.info(f"Train size: {len(train_loader.dataset)}")

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # The model computes its own classification loss internally when
    # `labels` is passed to forward(), so the unused (and for 5-class
    # classification inappropriate) MSELoss criterion was removed.

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(review): the reported "NCCL Error 1: unhandled cuda error"
    # raised inside DataParallel.replicate() is usually an environment
    # problem (driver/NCCL mismatch or broken GPU peer-to-peer access),
    # not a bug in this script. Re-run with NCCL_DEBUG=INFO, and if P2P
    # is the culprit try NCCL_P2P_DISABLE=1 — confirm on your machine.
    if torch.cuda.device_count() > 1:
        logging.info(f"Let's use {torch.cuda.device_count()} GPUs!")
        model = torch.nn.DataParallel(model)
    else:
        logging.info("Let's use 1 GPU!")

    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Renamed from `labels` to avoid shadowing the dataset-level
            # `labels` list built above.
            batch_labels = batch["labels"].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=batch_labels,
            )

            # Under DataParallel the per-replica losses are gathered into
            # a vector (one element per GPU); reduce to a scalar before
            # backward(). On a single device .mean() is a no-op.
            loss = outputs.loss.mean()

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        logging.info(
            f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}"
        )

        # Evaluation: plain argmax accuracy on the held-out split.
        model.eval()
        correct = 0
        total = 0
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predicted = torch.argmax(outputs.logits, dim=1)
                total += batch_labels.size(0)
                correct += (predicted == batch_labels).sum().item()

        logging.info(f"Accuracy: {correct / total:.4f}")

    # DataParallel wraps the model; save_pretrained lives on the wrapped
    # module, so unwrap before saving.
    model_to_save = (
        model.module if isinstance(model, torch.nn.DataParallel) else model
    )
    model_to_save.save_pretrained(model_output)

and I got an error like this:

  File "main.py", line 104, in <module>
    outputs = model(
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 184, in forward
    replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 189, in replicate
    return replicate(module, device_ids, not torch.is_grad_enabled())
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/replicate.py", line 110, in replicate
    param_copies = _broadcast_coalesced_reshape(params, devices, detach)
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/replicate.py", line 83, in _broadcast_coalesced_reshape
    tensor_copies = Broadcast.apply(devices, *tensors)
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/autograd/function.py", line 553, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 23, in forward
    outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
  File "/opt/conda/envs/llama3/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 57, in broadcast_coalesced
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details)

I have tried it with a single GPU and it works.

Sign up or log in to comment