Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Adding tabs for different set of Accelerate's features and content for large scale training features (#2)
Browse files- Upload 18 files (e5cadf9d99a28f3f5354729e0d2c8d93a90bd7e0)
Co-authored-by: Sourab Mangrulkar <[email protected]>
- code_samples/base/accelerate +17 -0
- code_samples/base/basic +31 -0
- code_samples/base/calculating_metrics +51 -0
- code_samples/base/checkpointing +29 -0
- code_samples/base/experiment_tracking +32 -0
- code_samples/base/gradient_accumulation +33 -0
- code_samples/base/initial +11 -0
- code_samples/base/initial_with_metrics +27 -0
- code_samples/large_scale_training/aws_sagemaker +77 -0
- code_samples/large_scale_training/deepspeed +101 -0
- code_samples/large_scale_training/megatron-lm +119 -0
- code_samples/large_scale_training/multi_gpu +60 -0
- code_samples/large_scale_training/multi_node_multi_gpu +89 -0
- code_samples/large_scale_training/pytorch_fsdp +80 -0
- src/app.py +112 -19
- src/markup.py +9 -7
- src/template.py +5 -6
code_samples/base/accelerate
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<pre>
|
2 |
+
from accelerate import Accelerator
|
3 |
+
accelerator = Accelerator()
|
4 |
+
train_dataloader, model, optimizer, scheduler = accelerator.prepare(
|
5 |
+
dataloader, model, optimizer, scheduler
|
6 |
+
)
|
7 |
+
|
8 |
+
model.train()
|
9 |
+
for batch in train_dataloader:
|
10 |
+
optimizer.zero_grad()
|
11 |
+
inputs, targets = batch
|
12 |
+
outputs = model(inputs)
|
13 |
+
loss = loss_function(outputs, targets)
|
14 |
+
accelerator.backward(loss)
|
15 |
+
optimizer.step()
|
16 |
+
scheduler.step()
|
17 |
+
</pre>
|
code_samples/base/basic
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
<pre>
|
3 |
+
+from accelerate import Accelerator
|
4 |
+
+accelerator = Accelerator()
|
5 |
+
+dataloader, model, optimizer, scheduler = accelerator.prepare(
|
6 |
+
+ dataloader, model, optimizer, scheduler
|
7 |
+
+)
|
8 |
+
|
9 |
+
for batch in dataloader:
|
10 |
+
optimizer.zero_grad()
|
11 |
+
inputs, targets = batch
|
12 |
+
- inputs = inputs.to(device)
|
13 |
+
- targets = targets.to(device)
|
14 |
+
outputs = model(inputs)
|
15 |
+
loss = loss_function(outputs, targets)
|
16 |
+
- loss.backward()
|
17 |
+
+ accelerator.backward(loss)
|
18 |
+
optimizer.step()
|
19 |
+
scheduler.step()</pre>
|
20 |
+
##
|
21 |
+
Everything around `accelerate` occurs with the `Accelerator` class. To use it, first make an object.
|
22 |
+
Then call `.prepare` passing in the PyTorch objects that you would normally train with. This will
|
23 |
+
return the same objects, but they will be on the correct device and distributed if needed. Then
|
24 |
+
you can train as normal, but instead of calling `loss.backward()` you call `accelerator.backward(loss)`.
|
25 |
+
Also note that you don't need to call `model.to(device)` or `inputs.to(device)` anymore, as this
|
26 |
+
is done automatically by `accelerator.prepare()`.
|
27 |
+
|
28 |
+
##
|
29 |
+
To learn more, check out the related documentation:
|
30 |
+
- <a href="https://huggingface.co/docs/accelerate/basic_tutorials/migration" target="_blank">Migrating to 🤗 Accelerate</a>
|
31 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator" target="_blank">The Accelerator</a>
|
code_samples/base/calculating_metrics
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
<pre>
|
3 |
+
import evaluate
|
4 |
+
+from accelerate import Accelerator
|
5 |
+
+accelerator = Accelerator()
|
6 |
+
+train_dataloader, eval_dataloader, model, optimizer, scheduler = (
|
7 |
+
+ accelerator.prepare(
|
8 |
+
+ train_dataloader, eval_dataloader,
|
9 |
+
+ model, optimizer, scheduler
|
10 |
+
+ )
|
11 |
+
+)
|
12 |
+
metric = evaluate.load("accuracy")
|
13 |
+
for batch in train_dataloader:
|
14 |
+
optimizer.zero_grad()
|
15 |
+
inputs, targets = batch
|
16 |
+
- inputs = inputs.to(device)
|
17 |
+
- targets = targets.to(device)
|
18 |
+
outputs = model(inputs)
|
19 |
+
loss = loss_function(outputs, targets)
|
20 |
+
loss.backward()
|
21 |
+
optimizer.step()
|
22 |
+
scheduler.step()
|
23 |
+
|
24 |
+
model.eval()
|
25 |
+
for batch in eval_dataloader:
|
26 |
+
inputs, targets = batch
|
27 |
+
- inputs = inputs.to(device)
|
28 |
+
- targets = targets.to(device)
|
29 |
+
with torch.no_grad():
|
30 |
+
outputs = model(inputs)
|
31 |
+
predictions = outputs.argmax(dim=-1)
|
32 |
+
+ predictions, references = accelerator.gather_for_metrics(
|
33 |
+
+ (predictions, references)
|
34 |
+
+ )
|
35 |
+
metric.add_batch(
|
36 |
+
predictions = predictions,
|
37 |
+
references = references
|
38 |
+
)
|
39 |
+
print(metric.compute())</pre>
|
40 |
+
|
41 |
+
##
|
42 |
+
When calculating metrics on a validation set, you can use the `Accelerator.gather_for_metrics`
|
43 |
+
method to gather the predictions and references from all devices and then calculate the metric on the gathered values.
|
44 |
+
This will also *automatically* drop the padded values from the gathered tensors that were added to ensure
|
45 |
+
that all tensors have the same length. This ensures that the metric is calculated on the correct values.
|
46 |
+
##
|
47 |
+
To learn more, check out the related documentation:
|
48 |
+
|
49 |
+
- <a href="https://huggingface.co/docs/accelerate/en/quicktour#distributed-evaluation" target="_blank">Quicktour - Calculating metrics</a>
|
50 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.gather_for_metrics" target="_blank">API reference</a>
|
51 |
+
- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/multi_process_metrics.py" target="_blank">Example script</a>
|
code_samples/base/checkpointing
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
<pre>
|
3 |
+
from accelerate import Accelerator
|
4 |
+
accelerator = Accelerator()
|
5 |
+
dataloader, model, optimizer, scheduler = accelerator.prepare(
|
6 |
+
dataloader, model, optimizer, scheduler
|
7 |
+
)
|
8 |
+
|
9 |
+
for batch in dataloader:
|
10 |
+
optimizer.zero_grad()
|
11 |
+
inputs, targets = batch
|
12 |
+
outputs = model(inputs)
|
13 |
+
loss = loss_function(outputs, targets)
|
14 |
+
accelerator.backward(loss)
|
15 |
+
optimizer.step()
|
16 |
+
scheduler.step()
|
17 |
+
+accelerator.save_state("checkpoint_dir")
|
18 |
+
+accelerator.load_state("checkpoint_dir")</pre>
|
19 |
+
##
|
20 |
+
To save or load a checkpoint, `Accelerator` provides the `save_state` and `load_state` methods.
|
21 |
+
These methods will save or load the state of the model, optimizer, scheduler, as well as random states and
|
22 |
+
any custom registered objects from the main process on each device to a passed in folder.
|
23 |
+
**This API is designed to save and resume training states only from within the same python script or training setup.**
|
24 |
+
##
|
25 |
+
To learn more, check out the related documentation:
|
26 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/checkpoint" target="_blank">Saving and loading training states</a>
|
27 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" target="_blank">`save_state` API reference</a>
|
28 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.load_state" target="_blank">`load_state` API reference</a>
|
29 |
+
- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/checkpointing.py" target="_blank">Example script</a>
|
code_samples/base/experiment_tracking
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
<pre>
|
3 |
+
from accelerate import Accelerator
|
4 |
+
-accelerator = Accelerator()
|
5 |
+
+accelerator = Accelerator(log_with="wandb")
|
6 |
+
train_dataloader, model, optimizer, scheduler = accelerator.prepare(
|
7 |
+
dataloader, model, optimizer, scheduler
|
8 |
+
)
|
9 |
+
+accelerator.init_trackers()
|
10 |
+
model.train()
|
11 |
+
for batch in train_dataloader:
|
12 |
+
optimizer.zero_grad()
|
13 |
+
inputs, targets = batch
|
14 |
+
outputs = model(inputs)
|
15 |
+
loss = loss_function(outputs, targets)
|
16 |
+
+ accelerator.log({"loss":loss})
|
17 |
+
accelerator.backward(loss)
|
18 |
+
optimizer.step()
|
19 |
+
scheduler.step()
|
20 |
+
+accelerator.end_training()
|
21 |
+
</pre>
|
22 |
+
##
|
23 |
+
To use experiment trackers with `accelerate`, simply pass the desired tracker to the `log_with` parameter
|
24 |
+
when building the `Accelerator` object. Then initialize the tracker(s) by running `Accelerator.init_trackers()`
|
25 |
+
passing in any configurations they may need. Afterwards call `Accelerator.log` to log a particular value to your tracker.
|
26 |
+
At the end of training call `accelerator.end_training()` to call any finalization functions a tracking library
|
27 |
+
may need automatically.
|
28 |
+
##
|
29 |
+
To learn more, check out the related documentation:
|
30 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/tracking" target="_blank">Using experiment trackers</a>
|
31 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.log" target="_blank">Accelerator API Reference</a>
|
32 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/tracking" target="_blank">Tracking API Reference</a>
|
code_samples/base/gradient_accumulation
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
<pre>
|
3 |
+
from accelerate import Accelerator
|
4 |
+
accelerator = Accelerator(
|
5 |
+
+ gradient_accumulation_steps=2,
|
6 |
+
)
|
7 |
+
dataloader, model, optimizer, scheduler = accelerator.prepare(
|
8 |
+
dataloader, model, optimizer, scheduler
|
9 |
+
)
|
10 |
+
|
11 |
+
for batch in dataloader:
|
12 |
+
+ with accelerator.accumulate(model):
|
13 |
+
optimizer.zero_grad()
|
14 |
+
inputs, targets = batch
|
15 |
+
outputs = model(inputs)
|
16 |
+
loss = loss_function(outputs, targets)
|
17 |
+
accelerator.backward(loss)
|
18 |
+
optimizer.step()
|
19 |
+
scheduler.step()</pre>
|
20 |
+
|
21 |
+
##
|
22 |
+
When performing gradient accumulation in a distributed setup, there are many opportunities for efficiency mistakes
|
23 |
+
to occur. `Accelerator` provides a context manager that will take care of the details for you and ensure that the
|
24 |
+
model is training correctly. Simply wrap the training loop in the `Accelerator.accumulate` context manager
|
25 |
+
while passing in the model you are training on and during training the gradients will accumulate and synchronize
|
26 |
+
automatically when needed.
|
27 |
+
|
28 |
+
##
|
29 |
+
To learn more, check out the related documentation:
|
30 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation" target="_blank">Performing gradient accumulation</a>
|
31 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.accumulate" target="_blank">API reference</a>
|
32 |
+
- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation.py" target="_blank">Example script</a>
|
33 |
+
- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/automatic_gradient_accumulation.py" target="_blank">Performing automatic gradient accumulation example script</a>
|
code_samples/base/initial
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<pre>
|
2 |
+
for batch in dataloader:
|
3 |
+
optimizer.zero_grad()
|
4 |
+
inputs, targets = batch
|
5 |
+
inputs = inputs.to(device)
|
6 |
+
targets = targets.to(device)
|
7 |
+
outputs = model(inputs)
|
8 |
+
loss = loss_function(outputs, targets)
|
9 |
+
loss.backward()
|
10 |
+
optimizer.step()
|
11 |
+
scheduler.step()</pre>
|
code_samples/base/initial_with_metrics
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<pre>
|
2 |
+
import evaluate
|
3 |
+
metric = evaluate.load("accuracy")
|
4 |
+
for batch in train_dataloader:
|
5 |
+
optimizer.zero_grad()
|
6 |
+
inputs, targets = batch
|
7 |
+
inputs = inputs.to(device)
|
8 |
+
targets = targets.to(device)
|
9 |
+
outputs = model(inputs)
|
10 |
+
loss = loss_function(outputs, targets)
|
11 |
+
loss.backward()
|
12 |
+
optimizer.step()
|
13 |
+
scheduler.step()
|
14 |
+
|
15 |
+
model.eval()
|
16 |
+
for batch in eval_dataloader:
|
17 |
+
inputs, targets = batch
|
18 |
+
inputs = inputs.to(device)
|
19 |
+
targets = targets.to(device)
|
20 |
+
with torch.no_grad():
|
21 |
+
outputs = model(inputs)
|
22 |
+
predictions = outputs.argmax(dim=-1)
|
23 |
+
metric.add_batch(
|
24 |
+
predictions = predictions,
|
25 |
+
references = references
|
26 |
+
)
|
27 |
+
print(metric.compute())</pre>
|
code_samples/large_scale_training/aws_sagemaker
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
Run `accelerate config` and answer the questionnaire accordingly.
|
3 |
+
Below is an example yaml for running code remotely on AWS SageMaker. Replace placeholder `xxxxx` with
|
4 |
+
appropriate values.
|
5 |
+
|
6 |
+
<pre>
|
7 |
+
base_job_name: accelerate-sagemaker-1
|
8 |
+
compute_environment: AMAZON_SAGEMAKER
|
9 |
+
distributed_type: 'NO'
|
10 |
+
dynamo_backend: 'NO'
|
11 |
+
ec2_instance_type: ml.p3.2xlarge
|
12 |
+
gpu_ids: all
|
13 |
+
iam_role_name: xxxxx
|
14 |
+
mixed_precision: 'no'
|
15 |
+
num_machines: 1
|
16 |
+
profile: xxxxx
|
17 |
+
py_version: py38
|
18 |
+
pytorch_version: 1.10.2
|
19 |
+
region: us-east-1
|
20 |
+
transformers_version: 4.17.0
|
21 |
+
use_cpu: false
|
22 |
+
</pre>
|
23 |
+
##
|
24 |
+
<pre>
|
25 |
+
from accelerate import Accelerator
|
26 |
+
|
27 |
+
def parse_args():
|
28 |
+
parser = argparse.ArgumentParser(description="sample task")
|
29 |
+
|
30 |
+
parser.add_argument(
|
31 |
+
"--pad_to_max_length",
|
32 |
+
- action="store_true",
|
33 |
+
+ type=bool,
|
34 |
+
+ default=False,
|
35 |
+
help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
|
36 |
+
)
|
37 |
+
|
38 |
+
...
|
39 |
+
|
40 |
+
|
41 |
+
+ def main():
|
42 |
+
accelerator = Accelerator()
|
43 |
+
|
44 |
+
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
|
45 |
+
model, optimizer, training_dataloader, scheduler
|
46 |
+
)
|
47 |
+
|
48 |
+
for batch in training_dataloader:
|
49 |
+
optimizer.zero_grad()
|
50 |
+
inputs, targets = batch
|
51 |
+
outputs = model(inputs)
|
52 |
+
loss = loss_function(outputs, targets)
|
53 |
+
accelerator.backward(loss)
|
54 |
+
optimizer.step()
|
55 |
+
scheduler.step()
|
56 |
+
|
57 |
+
- torch.save('/opt/ml/model')
|
58 |
+
+ accelerator.save('/opt/ml/model')
|
59 |
+
|
60 |
+
+ if __name__ == "__main__":
|
61 |
+
+ main()
|
62 |
+
</pre>
|
63 |
+
Launching a script using default accelerate config file looks like the following:
|
64 |
+
```
|
65 |
+
accelerate launch {script_name.py} {--arg1} {--arg2} ...
|
66 |
+
```
|
67 |
+
##
|
68 |
+
SageMaker doesn’t support argparse actions. If you want to use, for example, boolean hyperparameters, you need to specify the type as `bool` in your script and provide an explicit `True` or `False` value for this hyperparameter. An example of this is shown above for the `pad_to_max_length` argument. Another important point is to save all the output artifacts to `/opt/ml/model` or use `os.environ["SM_MODEL_DIR"]` as your save directory. After training, artifacts in this directory are uploaded to S3; an example is shown in the code snippet above.
|
69 |
+
|
70 |
+
You can provide custom docker image, input channels pointing to S3 data locations and use SageMaker metrics logging
|
71 |
+
as part of advanced features. Please refer to <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>.
|
72 |
+
|
73 |
+
##
|
74 |
+
To learn more, check out the related documentation:
|
75 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/sagemaker" target="_blank">How to use 🤗 Accelerate with SageMaker</a>
|
76 |
+
- <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
|
77 |
+
- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>
|
code_samples/large_scale_training/deepspeed
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
Run `accelerate config` and answer the questionnaire accordingly.
|
3 |
+
Below is an example yaml for mixed-precision training using DeepSpeed ZeRO Stage-3 with CPU offloading on 8 GPUs.
|
4 |
+
<pre>
|
5 |
+
compute_environment: LOCAL_MACHINE
|
6 |
+
deepspeed_config:
|
7 |
+
gradient_accumulation_steps: 1
|
8 |
+
gradient_clipping: 1.0
|
9 |
+
offload_optimizer_device: cpu
|
10 |
+
offload_param_device: cpu
|
11 |
+
zero3_init_flag: true
|
12 |
+
zero3_save_16bit_model: true
|
13 |
+
zero_stage: 3
|
14 |
+
distributed_type: DEEPSPEED
|
15 |
+
downcast_bf16: 'no'
|
16 |
+
dynamo_backend: 'NO'
|
17 |
+
fsdp_config: {}
|
18 |
+
machine_rank: 0
|
19 |
+
main_training_function: main
|
20 |
+
megatron_lm_config: {}
|
21 |
+
mixed_precision: fp16
|
22 |
+
num_machines: 1
|
23 |
+
num_processes: 8
|
24 |
+
rdzv_backend: static
|
25 |
+
same_network: true
|
26 |
+
use_cpu: false
|
27 |
+
</pre>
|
28 |
+
##
|
29 |
+
<pre>
|
30 |
+
from accelerate import Accelerator
|
31 |
+
|
32 |
+
+ def main():
|
33 |
+
accelerator = Accelerator()
|
34 |
+
|
35 |
+
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
|
36 |
+
model, optimizer, training_dataloader, scheduler
|
37 |
+
)
|
38 |
+
|
39 |
+
for batch in training_dataloader:
|
40 |
+
optimizer.zero_grad()
|
41 |
+
inputs, targets = batch
|
42 |
+
outputs = model(inputs)
|
43 |
+
loss = loss_function(outputs, targets)
|
44 |
+
accelerator.backward(loss)
|
45 |
+
optimizer.step()
|
46 |
+
scheduler.step()
|
47 |
+
|
48 |
+
...
|
49 |
+
|
50 |
+
generated_tokens = accelerator.unwrap_model(model).generate(
|
51 |
+
batch["input_ids"],
|
52 |
+
attention_mask=batch["attention_mask"],
|
53 |
+
**gen_kwargs,
|
54 |
+
+ synced_gpus=True #required for ZeRO Stage 3
|
55 |
+
)
|
56 |
+
...
|
57 |
+
|
58 |
+
accelerator.unwrap_model(model).save_pretrained(
|
59 |
+
args.output_dir,
|
60 |
+
is_main_process=accelerator.is_main_process,
|
61 |
+
save_function=accelerator.save,
|
62 |
+
+ state_dict=accelerator.get_state_dict(model), #required for ZeRO Stage 3
|
63 |
+
)
|
64 |
+
|
65 |
+
...
|
66 |
+
|
67 |
+
+ if __name__ == "__main__":
|
68 |
+
+ main()
|
69 |
+
</pre>
|
70 |
+
|
71 |
+
Launching a script using default accelerate config file looks like the following:
|
72 |
+
```
|
73 |
+
accelerate launch {script_name.py} {--arg1} {--arg2} ...
|
74 |
+
```
|
75 |
+
|
76 |
+
Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
|
77 |
+
```
|
78 |
+
accelerate launch \
|
79 |
+
--use_deepspeed \
|
80 |
+
--num_processes=8 \
|
81 |
+
--mixed_precision=fp16 \
|
82 |
+
--zero_stage=3 \
|
83 |
+
--gradient_accumulation_steps=1 \
|
84 |
+
--gradient_clipping=1 \
|
85 |
+
--zero3_init_flag=True \
|
86 |
+
--zero3_save_16bit_model=True \
|
87 |
+
--offload_optimizer_device=cpu \
|
88 |
+
--offload_param_device=cpu \
|
89 |
+
{script_name.py} {--arg1} {--arg2} ...
|
90 |
+
```
|
91 |
+
|
92 |
+
##
|
93 |
+
For core DeepSpeed features supported via accelerate config file, no changes are required for ZeRO Stages 1 and 2. For ZeRO Stage-3, transformers' `generate` function requires `synced_gpus=True` and `save_pretrained` requires the `state_dict` param due to the fact that model parameters are sharded across the GPUs.
|
94 |
+
|
95 |
+
For advanced users who like granular control via a DeepSpeed config file, this is supported: you can pass its location when running the `accelerate config` command. You can also specify the values of most of the fields in the DeepSpeed config file as `auto`, and they are filled in automatically via the arguments of the `accelerate launch` command and the `accelerator.prepare` call, thereby making life simple for users. Please refer to the docs on <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed#deepspeed-config-file" target="_blank">DeepSpeed Config File</a>.
|
96 |
+
|
97 |
+
##
|
98 |
+
To learn more, check out the related documentation:
|
99 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed" target="_blank">How to use DeepSpeed</a>
|
100 |
+
- <a href="https://huggingface.co/blog/accelerate-deepspeed" target="_blank">Accelerate Large Model Training using DeepSpeed</a>
|
101 |
+
- <a href="https://huggingface.co/docs/accelerate/package_reference/deepspeed" target="_blank">DeepSpeed Utilities</a>
|
code_samples/large_scale_training/megatron-lm
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
Run `accelerate config` and answer the questionnaire accordingly.
|
3 |
+
Below is an example yaml for BF16 mixed-precision training using Megatron-LM with DPxTPxPP=2x2x2 degrees on 8 GPUs. (DP-Data Parallelism, PP-Pipeline Parallelism, TP-Tensor Parallelism). It is also using Sequence Parallelism and selective activation checkpointing along with sharded optimizer.
|
4 |
+
<pre>
|
5 |
+
compute_environment: LOCAL_MACHINE
|
6 |
+
deepspeed_config: {}
|
7 |
+
distributed_type: MEGATRON_LM
|
8 |
+
downcast_bf16: 'no'
|
9 |
+
dynamo_backend: 'NO'
|
10 |
+
fsdp_config: {}
|
11 |
+
machine_rank: 0
|
12 |
+
main_training_function: main
|
13 |
+
megatron_lm_config:
|
14 |
+
megatron_lm_gradient_clipping: 1.0
|
15 |
+
megatron_lm_num_micro_batches: 2
|
16 |
+
megatron_lm_pp_degree: 2
|
17 |
+
megatron_lm_recompute_activations: true
|
18 |
+
megatron_lm_sequence_parallelism: true
|
19 |
+
megatron_lm_tp_degree: 2
|
20 |
+
megatron_lm_use_distributed_optimizer: true
|
21 |
+
mixed_precision: bf16
|
22 |
+
num_machines: 1
|
23 |
+
num_processes: 8
|
24 |
+
rdzv_backend: static
|
25 |
+
same_network: true
|
26 |
+
use_cpu: false
|
27 |
+
</pre>
|
28 |
+
##
|
29 |
+
<pre>
|
30 |
+
from accelerate import Accelerator
|
31 |
+
|
32 |
+
+ def main():
|
33 |
+
accelerator = Accelerator()
|
34 |
+
|
35 |
+
...
|
36 |
+
|
37 |
+
- lr_scheduler = get_scheduler(
|
38 |
+
- name=args.lr_scheduler_type,
|
39 |
+
+ lr_scheduler = accelerate.utils.MegatronLMDummyScheduler(
|
40 |
+
optimizer=optimizer,
|
41 |
+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
42 |
+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
43 |
+
)
|
44 |
+
|
45 |
+
|
46 |
+
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
47 |
+
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
48 |
+
)
|
49 |
+
|
50 |
+
total_batch_size = (
|
51 |
+
- args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
52 |
+
+ accelerator.state.megatron_lm_plugin.global_batch_size
|
53 |
+
)
|
54 |
+
|
55 |
+
for batch in training_dataloader:
|
56 |
+
optimizer.zero_grad()
|
57 |
+
inputs, targets = batch
|
58 |
+
outputs = model(inputs)
|
59 |
+
loss = loss_function(outputs, targets)
|
60 |
+
accelerator.backward(loss)
|
61 |
+
optimizer.step()
|
62 |
+
scheduler.step()
|
63 |
+
|
64 |
+
...
|
65 |
+
|
66 |
+
# in eval loop
|
67 |
+
for step, batch in enumerate(eval_dataloader):
|
68 |
+
with torch.no_grad():
|
69 |
+
outputs = model(**batch)
|
70 |
+
loss = outputs.loss
|
71 |
+
- losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
|
72 |
+
+ losses.append(loss) # For Megatron-LM, the losses are already averaged across the data parallel group
|
73 |
+
- losses = torch.cat(losses)
|
74 |
+
+ losses = torch.tensor(losses)
|
75 |
+
eval_loss = torch.mean(losses)
|
76 |
+
perplexity = math.exp(eval_loss)
|
77 |
+
logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
|
78 |
+
|
79 |
+
+ accelerator.save_state(output_dir)
|
80 |
+
|
81 |
+
+ if __name__ == "__main__":
|
82 |
+
+ main()
|
83 |
+
</pre>
|
84 |
+
|
85 |
+
Launching a script using default accelerate config file looks like the following:
|
86 |
+
```
|
87 |
+
accelerate launch {script_name.py} {--arg1} {--arg2} ...
|
88 |
+
```
|
89 |
+
|
90 |
+
Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
|
91 |
+
```
|
92 |
+
accelerate launch \
|
93 |
+
--use_megatron_lm \
|
94 |
+
--num_processes=8 \
|
95 |
+
--mixed_precision=bf16 \
|
96 |
+
--megatron_lm_tp_degree=2 \
|
97 |
+
--megatron_lm_pp_degree=2 \
|
98 |
+
--megatron_lm_num_micro_batches=2 \
|
99 |
+
--megatron_lm_sequence_parallelism=true \
|
100 |
+
--megatron_lm_recompute_activations=true \
|
101 |
+
--megatron_lm_use_distributed_optimizer=true \
|
102 |
+
{script_name.py} {--arg1} {--arg2} ...
|
103 |
+
```
|
104 |
+
|
105 |
+
##
|
106 |
+
For Megatron-LM, the supported models are the Transformers GPT2, Megatron-BERT and T5 models, covering the decoder-only, encoder-only and encoder-decoder model classes. Given the complexity of the features of Megatron-LM, the 4 changes that are required to get started are:
|
107 |
+
1. Using `accelerate.utils.MegatronLMDummyScheduler` as Megatron-LM uses its own implementation of Optimizer, the corresponding scheduler compatible with it needs to be used.
|
108 |
+
2. Getting the details of the total batch size now needs to be cognizant of tensor and pipeline parallel sizes.
|
109 |
+
3. Losses are already averaged across the data parallel group
|
110 |
+
4. Save the model using `accelerator.save_state` instead of transformers `save_pretrained`
|
111 |
+
|
112 |
+
These changes have been highlighted in the code snippet above.
|
113 |
+
|
114 |
+
Megatron-LM integration supports many advanced features such as the ability to leverage a custom train step, using Megatron-LM indexed datasets, checkpoint reshaping and interoperability utilities, the `megatron_generate` function for text generation using Tensor and Pipeline Parallelism, and support for RoPE/ALiBi positional embeddings and Multi-Query Attention. However, these require more changes owing to the complexity; worth it for getting the highest performance.
|
115 |
+
|
116 |
+
##
|
117 |
+
To learn more, check out the related documentation:
|
118 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/megatron_lm" target="_blank">How to use Megatron-LM</a>
|
119 |
+
- <a href="https://github.com/pacman100/accelerate-megatron-test" target="_blank">Examples showcasing the Megatron-LM integration of Accelerate</a>
|
code_samples/large_scale_training/multi_gpu
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
Run `accelerate config` and answer the questionnaire accordingly.
|
3 |
+
Below is an example yaml for using multi-gpu training with 4 GPUs.
|
4 |
+
<pre>
|
5 |
+
compute_environment: LOCAL_MACHINE
|
6 |
+
deepspeed_config: {}
|
7 |
+
distributed_type: MULTI_GPU
|
8 |
+
downcast_bf16: 'no'
|
9 |
+
dynamo_backend: 'NO'
|
10 |
+
fsdp_config: {}
|
11 |
+
gpu_ids: all
|
12 |
+
machine_rank: 0
|
13 |
+
main_training_function: main
|
14 |
+
megatron_lm_config: {}
|
15 |
+
mixed_precision: 'no'
|
16 |
+
num_machines: 1
|
17 |
+
num_processes: 4
|
18 |
+
rdzv_backend: static
|
19 |
+
same_network: true
|
20 |
+
use_cpu: false</pre>
|
21 |
+
##
|
22 |
+
<pre>
|
23 |
+
from accelerate import Accelerator
|
24 |
+
|
25 |
+
+ def main():
|
26 |
+
accelerator = Accelerator()
|
27 |
+
|
28 |
+
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
|
29 |
+
model, optimizer, training_dataloader, scheduler
|
30 |
+
)
|
31 |
+
|
32 |
+
for batch in training_dataloader:
|
33 |
+
optimizer.zero_grad()
|
34 |
+
inputs, targets = batch
|
35 |
+
outputs = model(inputs)
|
36 |
+
loss = loss_function(outputs, targets)
|
37 |
+
accelerator.backward(loss)
|
38 |
+
optimizer.step()
|
39 |
+
scheduler.step()
|
40 |
+
|
41 |
+
+ if __name__ == "__main__":
|
42 |
+
+ main()
|
43 |
+
</pre>
|
44 |
+
|
45 |
+
Launching a script using default accelerate config file looks like the following:
|
46 |
+
```
|
47 |
+
accelerate launch {script_name.py} {--arg1} {--arg2} ...
|
48 |
+
```
|
49 |
+
|
50 |
+
Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
|
51 |
+
```
|
52 |
+
accelerate launch --multi_gpu --num_processes=4 {script_name.py} {--arg1} {--arg2} ...
|
53 |
+
```
|
54 |
+
|
55 |
+
##
|
56 |
+
Using this feature involves no changes to the code apart from the ones mentioned in the tab `Simplify your code and improve efficiency`.
|
57 |
+
##
|
58 |
+
To learn more, check out the related documentation:
|
59 |
+
- <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
|
60 |
+
- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>
|
code_samples/large_scale_training/multi_node_multi_gpu
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
Run `accelerate config` and answer the questionnaire accordingly.
|
3 |
+
Below is an example yaml for using multi-gpu training with 4 GPUs on each of 2 nodes/machines (8 processes in total).
|
4 |
+
|
5 |
+
On Node/Machine 1:
|
6 |
+
<pre>
|
7 |
+
compute_environment: LOCAL_MACHINE
|
8 |
+
deepspeed_config: {}
|
9 |
+
distributed_type: MULTI_GPU
|
10 |
+
downcast_bf16: 'no'
|
11 |
+
dynamo_backend: 'NO'
|
12 |
+
fsdp_config: {}
|
13 |
+
gpu_ids: all
|
14 |
+
machine_rank: 0
|
15 |
+
main_process_ip: 192.168.20.1
|
16 |
+
main_process_port: 8080
|
17 |
+
main_training_function: main
|
18 |
+
megatron_lm_config: {}
|
19 |
+
mixed_precision: 'no'
|
20 |
+
num_machines: 2
|
21 |
+
num_processes: 8
|
22 |
+
rdzv_backend: static
|
23 |
+
same_network: true
|
24 |
+
use_cpu: false
|
25 |
+
</pre>
|
26 |
+
|
27 |
+
On Node/Machine 2:
|
28 |
+
<pre>
|
29 |
+
compute_environment: LOCAL_MACHINE
|
30 |
+
deepspeed_config: {}
|
31 |
+
distributed_type: MULTI_GPU
|
32 |
+
downcast_bf16: 'no'
|
33 |
+
dynamo_backend: 'NO'
|
34 |
+
fsdp_config: {}
|
35 |
+
gpu_ids: all
|
36 |
+
-machine_rank: 0
|
37 |
+
+machine_rank: 1
|
38 |
+
main_process_ip: 192.168.20.1
|
39 |
+
main_process_port: 8080
|
40 |
+
main_training_function: main
|
41 |
+
megatron_lm_config: {}
|
42 |
+
mixed_precision: 'no'
|
43 |
+
num_machines: 2
|
44 |
+
num_processes: 8
|
45 |
+
rdzv_backend: static
|
46 |
+
same_network: true
|
47 |
+
use_cpu: false
|
48 |
+
</pre>
|
49 |
+
##
|
50 |
+
<pre>
|
51 |
+
from accelerate import Accelerator
|
52 |
+
|
53 |
+
+ def main():
|
54 |
+
accelerator = Accelerator()
|
55 |
+
|
56 |
+
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
|
57 |
+
model, optimizer, training_dataloader, scheduler
|
58 |
+
)
|
59 |
+
|
60 |
+
for batch in training_dataloader:
|
61 |
+
optimizer.zero_grad()
|
62 |
+
inputs, targets = batch
|
63 |
+
outputs = model(inputs)
|
64 |
+
loss = loss_function(outputs, targets)
|
65 |
+
accelerator.backward(loss)
|
66 |
+
optimizer.step()
|
67 |
+
scheduler.step()
|
68 |
+
|
69 |
+
+ if __name__ == "__main__":
|
70 |
+
+ main()
|
71 |
+
</pre>
|
72 |
+
|
73 |
+
Launching a script using default accelerate config file looks like the following:
|
74 |
+
```
|
75 |
+
accelerate launch {script_name.py} {--arg1} {--arg2} ...
|
76 |
+
```
|
77 |
+
|
78 |
+
Alternatively, you can use `accelerate launch` with the right config params for multi-gpu training as shown below. Replace `{node_number}` with the rank of the machine you are launching on (`0` for Node/Machine 1, `1` for Node/Machine 2).
|
79 |
+
```
|
80 |
+
accelerate launch --multi_gpu --num_machines=2 --num_processes=8 --main_process_ip="192.168.20.1" --main_process_port=8080 \
|
81 |
+
--machine_rank={node_number} {script_name.py} {--arg1} {--arg2} ...
|
82 |
+
```
|
83 |
+
|
84 |
+
##
|
85 |
+
Using this feature involves no changes to the code apart from the ones mentioned in the tab `Simplify your code and improve efficieny`.
|
86 |
+
##
|
87 |
+
To learn more, check out the related documentation:
|
88 |
+
- <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
|
89 |
+
- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>
|
code_samples/large_scale_training/pytorch_fsdp
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
Run `accelerate config` and answer the questionnaire accordingly.
|
3 |
+
Below is an example yaml for BF16 mixed-precision training using PyTorch FSDP with CPU offloading on 8 GPUs.
|
4 |
+
<pre>
|
5 |
+
compute_environment: LOCAL_MACHINE
|
6 |
+
deepspeed_config: {}
|
7 |
+
distributed_type: FSDP
|
8 |
+
downcast_bf16: 'no'
|
9 |
+
dynamo_backend: 'NO'
|
10 |
+
fsdp_config:
|
11 |
+
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
12 |
+
fsdp_backward_prefetch_policy: BACKWARD_PRE
|
13 |
+
fsdp_offload_params: true
|
14 |
+
fsdp_sharding_strategy: 1
|
15 |
+
fsdp_state_dict_type: FULL_STATE_DICT
|
16 |
+
fsdp_transformer_layer_cls_to_wrap: T5Block
|
17 |
+
machine_rank: 0
|
18 |
+
main_training_function: main
|
19 |
+
megatron_lm_config: {}
|
20 |
+
mixed_precision: bf16
|
21 |
+
num_machines: 1
|
22 |
+
num_processes: 8
|
23 |
+
rdzv_backend: static
|
24 |
+
same_network: true
|
25 |
+
use_cpu: false
|
26 |
+
</pre>
|
27 |
+
##
|
28 |
+
<pre>
|
29 |
+
from accelerate import Accelerator
|
30 |
+
|
31 |
+
+ def main():
|
32 |
+
accelerator = Accelerator()
|
33 |
+
|
34 |
+
model = accelerator.prepare(model)
|
35 |
+
|
36 |
+
optimizer, training_dataloader, scheduler = accelerator.prepare(
|
37 |
+
optimizer, training_dataloader, scheduler
|
38 |
+
)
|
39 |
+
|
40 |
+
for batch in training_dataloader:
|
41 |
+
optimizer.zero_grad()
|
42 |
+
inputs, targets = batch
|
43 |
+
outputs = model(inputs)
|
44 |
+
loss = loss_function(outputs, targets)
|
45 |
+
accelerator.backward(loss)
|
46 |
+
optimizer.step()
|
47 |
+
scheduler.step()
|
48 |
+
|
49 |
+
...
|
50 |
+
|
51 |
+
+ if __name__ == "__main__":
|
52 |
+
+ main()
|
53 |
+
</pre>
|
54 |
+
|
55 |
+
Launching a script using default accelerate config file looks like the following:
|
56 |
+
```
|
57 |
+
accelerate launch {script_name.py} {--arg1} {--arg2} ...
|
58 |
+
```
|
59 |
+
|
60 |
+
Alternatively, you can use `accelerate launch` with the right config params for FSDP training as shown below:
|
61 |
+
```
|
62 |
+
accelerate launch \
|
63 |
+
--use_fsdp \
|
64 |
+
--num_processes=8 \
|
65 |
+
--mixed_precision=bf16 \
|
66 |
+
--fsdp_sharding_strategy=1 \
|
67 |
+
--fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
|
68 |
+
--fsdp_transformer_layer_cls_to_wrap=T5Block \
|
69 |
+
--fsdp_offload_params=true \
|
70 |
+
{script_name.py} {--arg1} {--arg2} ...
|
71 |
+
```
|
72 |
+
|
73 |
+
##
|
74 |
+
For PyTorch FSDP, you need to prepare the model first before preparing the optimizer, since FSDP will shard parameters in-place and this will break any previously initialized optimizers. The same is outlined in the above code snippet. For transformer models, please use the `TRANSFORMER_BASED_WRAP` auto wrap policy as shown in the config above.
|
75 |
+
|
76 |
+
|
77 |
+
##
|
78 |
+
To learn more, check out the related documentation:
|
79 |
+
- <a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp" target="_blank">How to use FSDP</a>
|
80 |
+
- <a href="https://huggingface.co/blog/pytorch-fsdp" target="_blank">Accelerate Large Model Training using PyTorch Fully Sharded Data Parallel</a>
|
src/app.py
CHANGED
@@ -4,47 +4,140 @@ from template import get_templates
|
|
4 |
|
5 |
templates = get_templates()
|
6 |
|
7 |
-
|
|
|
8 |
"""Based on an `inp`, render and highlight the appropriate code sample.
|
9 |
|
10 |
Args:
|
11 |
inp (`str`):
|
12 |
The input button from the interface.
|
|
|
|
|
13 |
|
14 |
Returns:
|
15 |
`tuple`: A tuple of the highlighted code diff, and the title for the section.
|
16 |
"""
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
default = change("Basic")
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
inp = gr.Radio(
|
29 |
-
["Basic", "Calculating Metrics", "Checkpointing", "Experiment Tracking", "Gradient Accumulation"],
|
30 |
label="Select a feature you would like to integrate",
|
31 |
-
value="Basic"
|
32 |
)
|
33 |
with gr.Row():
|
34 |
with gr.Column():
|
35 |
feature = gr.Markdown("## Accelerate Code")
|
36 |
out = gr.Markdown(default[0])
|
37 |
with gr.Row():
|
38 |
-
with gr.Column():
|
39 |
gr.Markdown("## Explanation")
|
40 |
explanation = gr.Markdown(default[2])
|
41 |
with gr.Row():
|
42 |
-
with gr.Column():
|
43 |
gr.Markdown("## Documentation Links")
|
44 |
docs = gr.Markdown(default[3])
|
45 |
-
inp.change(
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
49 |
)
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
templates = get_templates()
|
6 |
|
7 |
+
|
8 |
+
def change(inp, textbox):
|
9 |
"""Based on an `inp`, render and highlight the appropriate code sample.
|
10 |
|
11 |
Args:
|
12 |
inp (`str`):
|
13 |
The input button from the interface.
|
14 |
+
textbox (`str`):
|
15 |
+
The textbox specifying the tab name from the interface.
|
16 |
|
17 |
Returns:
|
18 |
`tuple`: A tuple of the highlighted code diff, and the title for the section.
|
19 |
"""
|
20 |
+
if textbox == "base":
|
21 |
+
code, explanation, docs = get_text(inp, textbox)
|
22 |
+
if inp == "Basic":
|
23 |
+
return (highlight(code), "## Accelerate Code (Base Integration)", explanation, docs)
|
24 |
+
elif inp == "Calculating Metrics":
|
25 |
+
return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
|
26 |
+
else:
|
27 |
+
return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
|
28 |
+
elif textbox == "large_scale_training":
|
29 |
+
config, code, explanation, docs = get_text(inp, textbox)
|
30 |
+
return (highlight(config), highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
|
31 |
|
|
|
32 |
|
33 |
+
default = change("Basic", "base")
|
34 |
+
|
35 |
+
|
36 |
+
def base_features(textbox):
|
37 |
+
# textbox.value = "base"
|
38 |
inp = gr.Radio(
|
39 |
+
["Basic", "Calculating Metrics", "Checkpointing", "Experiment Tracking", "Gradient Accumulation"],
|
40 |
label="Select a feature you would like to integrate",
|
41 |
+
value="Basic",
|
42 |
)
|
43 |
with gr.Row():
|
44 |
with gr.Column():
|
45 |
feature = gr.Markdown("## Accelerate Code")
|
46 |
out = gr.Markdown(default[0])
|
47 |
with gr.Row():
|
48 |
+
with gr.Column():
|
49 |
gr.Markdown("## Explanation")
|
50 |
explanation = gr.Markdown(default[2])
|
51 |
with gr.Row():
|
52 |
+
with gr.Column():
|
53 |
gr.Markdown("## Documentation Links")
|
54 |
docs = gr.Markdown(default[3])
|
55 |
+
inp.change(fn=change, inputs=[inp, textbox], outputs=[out, feature, explanation, docs])
|
56 |
+
|
57 |
+
|
58 |
+
def large_scale_training(textbox):
|
59 |
+
# textbox.value = "large_scale_training"
|
60 |
+
inp = gr.Radio(
|
61 |
+
["Multi GPU", "Multi Node Multi GPU", "AWS SageMaker", "DeepSpeed", "PyTorch FSDP", "Megatron-LM"],
|
62 |
+
label="Select a feature you would like to integrate",
|
63 |
+
value="Basic",
|
64 |
)
|
65 |
+
with gr.Row():
|
66 |
+
with gr.Column():
|
67 |
+
feature = gr.Markdown("## Accelerate Config")
|
68 |
+
config = gr.Markdown("")
|
69 |
+
with gr.Row():
|
70 |
+
with gr.Column():
|
71 |
+
feature = gr.Markdown("## Accelerate Code")
|
72 |
+
out = gr.Markdown("")
|
73 |
+
with gr.Row():
|
74 |
+
with gr.Column():
|
75 |
+
gr.Markdown("## Explanation")
|
76 |
+
explanation = gr.Markdown("")
|
77 |
+
with gr.Row():
|
78 |
+
with gr.Column():
|
79 |
+
gr.Markdown("## Documentation Links")
|
80 |
+
docs = gr.Markdown("")
|
81 |
+
inp.change(fn=change, inputs=[inp, textbox], outputs=[config, out, feature, explanation, docs])
|
82 |
+
|
83 |
+
|
84 |
+
# def big_model_inference():
|
85 |
+
# inp = gr.Radio(
|
86 |
+
# ["Accelerate's Big Model Inference",], # "DeepSpeed ZeRO Stage-3 Offload"
|
87 |
+
# label="Select a feature you would like to integrate",
|
88 |
+
# value="Basic",
|
89 |
+
# )
|
90 |
+
# with gr.Row():
|
91 |
+
# with gr.Column():
|
92 |
+
# feature = gr.Markdown("## Accelerate Code")
|
93 |
+
# out = gr.Markdown(default[0])
|
94 |
+
# with gr.Row():
|
95 |
+
# with gr.Column():
|
96 |
+
# gr.Markdown(default[1])
|
97 |
+
# explanation = gr.Markdown(default[2])
|
98 |
+
# with gr.Row():
|
99 |
+
# with gr.Column():
|
100 |
+
# gr.Markdown("## Documentation Links")
|
101 |
+
# docs = gr.Markdown(default[3])
|
102 |
+
# inp.change(fn=change, inputs=[inp, "big_model_inference"], outputs=[out, feature, explanation, docs])
|
103 |
+
|
104 |
+
|
105 |
+
# def notebook_launcher():
|
106 |
+
# inp = gr.Radio(
|
107 |
+
# ["Colab GPU", "Colab TPU", "Kaggle GPU", "Kaggle Multi GPU", "Kaggle TPU", "Multi GPU VMs"],
|
108 |
+
# label="Select a feature you would like to integrate",
|
109 |
+
# value="Basic",
|
110 |
+
# )
|
111 |
+
# with gr.Row():
|
112 |
+
# with gr.Column():
|
113 |
+
# feature = gr.Markdown("## Accelerate Code")
|
114 |
+
# out = gr.Markdown(default[0])
|
115 |
+
# with gr.Row():
|
116 |
+
# with gr.Column():
|
117 |
+
# gr.Markdown(default[1])
|
118 |
+
# explanation = gr.Markdown(default[2])
|
119 |
+
# with gr.Row():
|
120 |
+
# with gr.Column():
|
121 |
+
# gr.Markdown("## Documentation Links")
|
122 |
+
# docs = gr.Markdown(default[3])
|
123 |
+
# inp.change(fn=change, inputs=[inp, "notebook_launcher"], outputs=[out, feature, explanation, docs])
|
124 |
+
|
125 |
+
|
126 |
+
with gr.Blocks() as demo:
|
127 |
+
|
128 |
+
with gr.Tabs():
|
129 |
+
with gr.TabItem("Simplify your code and improve efficieny"):
|
130 |
+
textbox = gr.Textbox(label="tab_name", visible=False, value="base")
|
131 |
+
base_features(textbox)
|
132 |
+
with gr.TabItem("Large Scale Training"):
|
133 |
+
textbox = gr.Textbox(label="tab_name", visible=False, value="large_scale_training")
|
134 |
+
large_scale_training(textbox)
|
135 |
+
with gr.TabItem("Big Model Inference"):
|
136 |
+
# big_model_inference()
|
137 |
+
pass
|
138 |
+
with gr.TabItem("Notebook Launcher Intergation"):
|
139 |
+
# notebook_launcher()
|
140 |
+
pass
|
141 |
+
|
142 |
+
|
143 |
+
demo.launch()
|
src/markup.py
CHANGED
@@ -17,6 +17,7 @@ from template import get_filename
|
|
17 |
_remove_color = "rgb(103,6,12)"
|
18 |
_addition_color = "rgb(6,103,12)"
|
19 |
|
|
|
20 |
def mark_text(text, add=True):
|
21 |
"""Marks text with a highlight color for addition or removal.
|
22 |
|
@@ -35,7 +36,8 @@ def mark_text(text, add=True):
|
|
35 |
color = _remove_color
|
36 |
return f'<mark style="background-color:{color}!important;color:white!important">{text}</mark>'
|
37 |
|
38 |
-
|
|
|
39 |
"""Takes in code and returns the respective highlighted code sample.
|
40 |
|
41 |
Args:
|
@@ -43,7 +45,7 @@ def highlight(code:str):
|
|
43 |
Code from a file.
|
44 |
"""
|
45 |
lines = code.split("\n")
|
46 |
-
for i,line in enumerate(lines):
|
47 |
if line.startswith("-"):
|
48 |
lines[i] = "- " + line[1:]
|
49 |
lines[i] = mark_text(lines[i], False)
|
@@ -54,12 +56,12 @@ def highlight(code:str):
|
|
54 |
lines[i] = " " + line
|
55 |
return "\n".join(lines).rstrip()
|
56 |
|
57 |
-
|
|
|
58 |
"""
|
59 |
Reads in an option and returns the code, explanation, and documentation links
|
60 |
"""
|
61 |
-
filename = option.lower().replace(
|
62 |
-
with open(get_filename(filename)) as f:
|
63 |
output = f.read()
|
64 |
-
|
65 |
-
return code, explanation, doclink
|
|
|
17 |
_remove_color = "rgb(103,6,12)"
|
18 |
_addition_color = "rgb(6,103,12)"
|
19 |
|
20 |
+
|
21 |
def mark_text(text, add=True):
|
22 |
"""Marks text with a highlight color for addition or removal.
|
23 |
|
|
|
36 |
color = _remove_color
|
37 |
return f'<mark style="background-color:{color}!important;color:white!important">{text}</mark>'
|
38 |
|
39 |
+
|
40 |
+
def highlight(code: str):
|
41 |
"""Takes in code and returns the respective highlighted code sample.
|
42 |
|
43 |
Args:
|
|
|
45 |
Code from a file.
|
46 |
"""
|
47 |
lines = code.split("\n")
|
48 |
+
for i, line in enumerate(lines):
|
49 |
if line.startswith("-"):
|
50 |
lines[i] = "- " + line[1:]
|
51 |
lines[i] = mark_text(lines[i], False)
|
|
|
56 |
lines[i] = " " + line
|
57 |
return "\n".join(lines).rstrip()
|
58 |
|
59 |
+
|
60 |
+
def get_text(option, tab):
|
61 |
"""
|
62 |
Reads in an option and returns the code, explanation, and documentation links
|
63 |
"""
|
64 |
+
filename = option.lower().replace(" ", "_")
|
65 |
+
with open(get_filename(tab, filename)) as f:
|
66 |
output = f.read()
|
67 |
+
return output.split("##\n")[1:]
|
|
src/template.py
CHANGED
@@ -15,17 +15,16 @@ import os
|
|
15 |
|
16 |
TEMPLATES = ["initial", "initial_with_metrics", "accelerate"]
|
17 |
|
18 |
-
|
|
|
19 |
"""
|
20 |
Takes an template and returns the respective filename relative to the cwd.
|
21 |
"""
|
22 |
-
return os.path.join(os.getcwd(), "code_samples", template)
|
|
|
23 |
|
24 |
def get_templates() -> dict:
|
25 |
"""
|
26 |
Returns a dictionary of template type to code content
|
27 |
"""
|
28 |
-
return {
|
29 |
-
template: open(get_filename(template)).read()
|
30 |
-
for template in TEMPLATES
|
31 |
-
}
|
|
|
15 |
|
16 |
TEMPLATES = ["initial", "initial_with_metrics", "accelerate"]
|
17 |
|
18 |
+
|
19 |
+
def get_filename(tab: str, template: str) -> str:
|
20 |
"""
|
21 |
Takes an template and returns the respective filename relative to the cwd.
|
22 |
"""
|
23 |
+
return os.path.join(os.getcwd(), "code_samples", tab, template)
|
24 |
+
|
25 |
|
26 |
def get_templates() -> dict:
|
27 |
"""
|
28 |
Returns a dictionary of template type to code content
|
29 |
"""
|
30 |
+
return {template: open(get_filename("base", template)).read() for template in TEMPLATES}
|
|
|
|
|
|