Spaces:
Sleeping
Sleeping
owaiskha9654
committed on
Commit
•
1ddbf73
1
Parent(s):
c28cc64
add aws
Browse files- utils/aws/__init__.py +1 -0
- utils/aws/mime.sh +26 -0
- utils/aws/resume.py +37 -0
- utils/aws/userdata.sh +27 -0
utils/aws/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
#init
|
utils/aws/mime.sh
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/
# This script will run on every instance restart, not only on first start
# Layout: a multipart/mixed message whose parts are separated by the boundary string "//".
#   Part 1 (text/cloud-config) sets the scripts-user module to 'always'.
#   Part 2 (text/x-shellscript) is the shell script itself.
#   The last line, '--//--', is the closing delimiter; a bare '--//' would open another (empty) part.
# --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA ---

Content-Type: multipart/mixed; boundary="//"
MIME-Version: 1.0

--//
Content-Type: text/cloud-config; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="cloud-config.txt"

#cloud-config
cloud_final_modules:
- [scripts-user, always]

--//
Content-Type: text/x-shellscript; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="userdata.txt"

#!/bin/bash
# --- paste contents of userdata.sh here ---
--//--
|
utils/aws/resume.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Resume all interrupted trainings in yolor/ dir including DDP trainings
|
2 |
+
# Usage: $ python utils/aws/resume.py
|
3 |
+
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import yaml
|
10 |
+
|
11 |
+
sys.path.append('./') # to run '$ python *.py' files in subdirectories
|
12 |
+
|
13 |
+
port = 0 # --master_port
|
14 |
+
path = Path('').resolve()
|
15 |
+
for last in path.rglob('*/**/last.pt'):
|
16 |
+
ckpt = torch.load(last)
|
17 |
+
if ckpt['optimizer'] is None:
|
18 |
+
continue
|
19 |
+
|
20 |
+
# Load opt.yaml
|
21 |
+
with open(last.parent.parent / 'opt.yaml') as f:
|
22 |
+
opt = yaml.load(f, Loader=yaml.SafeLoader)
|
23 |
+
|
24 |
+
# Get device count
|
25 |
+
d = opt['device'].split(',') # devices
|
26 |
+
nd = len(d) # number of devices
|
27 |
+
ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel
|
28 |
+
|
29 |
+
if ddp: # multi-GPU
|
30 |
+
port += 1
|
31 |
+
cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
|
32 |
+
else: # single-GPU
|
33 |
+
cmd = f'python train.py --resume {last}'
|
34 |
+
|
35 |
+
cmd += ' > /dev/null 2>&1 &' # redirect output to dev/null and run in daemon thread
|
36 |
+
print(cmd)
|
37 |
+
os.system(cmd)
|
utils/aws/userdata.sh
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
# This script will run only once on first instance start (for a re-start script see mime.sh)
# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
# Use >300 GB SSD
#
# If no yolor/ checkout exists: clone it, then fetch data, pull the Docker image and
# install requirements as three background jobs and wait for all of them.
# Otherwise: start every Docker container and launch utils/aws/resume.py inside each.

# Absolute path so the result does not depend on the directory the script is started from;
# stop instead of cloning or restarting in the wrong place when the directory is missing.
cd /home/ubuntu || exit 1
if [ ! -d yolor ]; then
  echo "Running first-time script." # install dependencies, download COCO, pull Docker
  git clone -b paper https://github.com/WongKinYiu/yolor && sudo chmod -R 777 yolor
  cd yolor || exit 1 # the steps below need the checkout
  bash data/scripts/get_coco.sh && echo "Data done." &
  sudo docker pull nvcr.io/nvidia/pytorch:21.08-py3 && echo "Docker done." &
  python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
  wait && echo "All tasks done." # finish background tasks
else
  echo "Running re-start script." # resume interrupted runs
  i=0
  list=$(sudo docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour'
  while IFS= read -r id; do
    [ -n "$id" ] || continue # an empty list still feeds one blank line through the here-string
    ((i++))
    echo "restarting container $i: $id"
    sudo docker start "$id"
    # sudo docker exec -it $id python train.py --resume # single-GPU
    sudo docker exec -d "$id" python utils/aws/resume.py # multi-scenario
  done <<<"$list"
fi
|