Spaces commit — Initial commit (Space status at capture time: Runtime error)

Files changed:
- .gitignore    +3  −0
- Dockerfile    +34 −0
- README.md     +5  −5
- train_llm.py  +69 −0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.venv/
|
2 |
+
__pycache__/
|
3 |
+
.env
|
Dockerfile
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use an official Python runtime as a parent image
|
2 |
+
FROM python:3.11.1
|
3 |
+
|
4 |
+
# Set the working directory in the container
|
5 |
+
WORKDIR /app
|
6 |
+
|
7 |
+
# Install poetry
|
8 |
+
# RUN pip3 install poetry==1.7.1
|
9 |
+
|
10 |
+
# Copy the current directory contents into the container at /usr/src/app
|
11 |
+
COPY . .
|
12 |
+
|
13 |
+
# Install dependencies
|
14 |
+
# RUN poetry config virtualenvs.create false \
|
15 |
+
# && poetry install --no-interaction --no-ansi
|
16 |
+
# Streamlit must be installed separately. Potentially this will cause an issue with dependencies in the future, but it's the only way it works.
|
17 |
+
# RUN pip3 install streamlit
|
18 |
+
|
19 |
+
# Install dependencies
|
20 |
+
RUN pip3 install -r requirements.txt
|
21 |
+
|
22 |
+
# Make a port available to the world outside this container
|
23 |
+
# The EXPOSE instruction informs Docker that the container listens on the specified network ports at runtime. Your container needs to listen to Streamlit’s (default) port 8501.
|
24 |
+
EXPOSE 8501
|
25 |
+
|
26 |
+
# The HEALTHCHECK instruction tells Docker how to test a container to check that it is still working. Your container needs to listen to Streamlit’s (default) port 8501:
|
27 |
+
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
28 |
+
|
29 |
+
# Run the command inside your image filesystem.
|
30 |
+
CMD ["python", "train_llm.py"]
|
31 |
+
|
32 |
+
# Execute with:
|
33 |
+
# docker build -t <image_name> .
|
34 |
+
# docker run -p 8501:8501 <image_name>
|
README.md — CHANGED (resolved content after this commit; the diff filled in
the empty frontmatter fields and added `app_port`):

---
title: Autotrain Playground
emoji: 🚀
colorFrom: gray
colorTo: blue
sdk: docker
pinned: false
app_port: 8501
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
train_llm.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import jsonlines
|
3 |
+
from uuid import uuid4
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from datasets import load_dataset
|
7 |
+
import subprocess
|
8 |
+
from tqdm.notebook import tqdm
|
9 |
+
|
10 |
+
# from dotenv import load_dotenv,find_dotenv
|
11 |
+
# load_dotenv(find_dotenv(),override=True)
|
12 |
+
|
13 |
+
# Load dataset
|
14 |
+
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
|
15 |
+
dataset=load_dataset(dataset_name)
|
16 |
+
|
17 |
+
# Write dataset files into data directory
|
18 |
+
data_directory = '../fine_tune_data/'
|
19 |
+
|
20 |
+
# Create the data directory if it doesn't exist
|
21 |
+
os.makedirs(data_directory, exist_ok=True)
|
22 |
+
|
23 |
+
# Write the train data to a CSV file
|
24 |
+
train_data='train_data.csv'
|
25 |
+
train_filename = os.path.join(data_directory, train_data)
|
26 |
+
dataset['train'].to_pandas().to_csv(train_filename, columns=['text'], index=False)
|
27 |
+
|
28 |
+
# Write the validation data to a CSV file
|
29 |
+
validation_data='validation_data.csv'
|
30 |
+
validation_filename = os.path.join(data_directory, validation_data)
|
31 |
+
dataset['validation'].to_pandas().to_csv(validation_filename, columns=['text'], index=False)
|
32 |
+
|
33 |
+
# Define project parameters
|
34 |
+
username='ai-aerospace'
|
35 |
+
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
|
36 |
+
repo_name='ams_data_train-100_'+str(uuid4())
|
37 |
+
|
38 |
+
model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
|
39 |
+
# model_name='mistralai/Mistral-7B-v0.1'
|
40 |
+
|
41 |
+
# Save parameters to environment variables
|
42 |
+
os.environ["project_name"] = project_name
|
43 |
+
os.environ["model_name"] = model_name
|
44 |
+
os.environ["repo_id"] = username+'/'+repo_name
|
45 |
+
os.environ["train_data"] = train_data
|
46 |
+
os.environ["validation_data"] = validation_data
|
47 |
+
|
48 |
+
# Set .venv and execute the autotrain script
|
49 |
+
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
|
50 |
+
# The training dataset to be used must be called training.csv and be located in the data_path folder.
|
51 |
+
command="""
|
52 |
+
source ../.venv/bin/activate && autotrain llm --train \
|
53 |
+
--project_name ${project_name} \
|
54 |
+
--model ${model_name} \
|
55 |
+
--data_path ../fine_tune_data \
|
56 |
+
--train_split ${train_data} \
|
57 |
+
--valid_split ${validation_data} \
|
58 |
+
--use-peft \
|
59 |
+
--learning_rate 2e-4 \
|
60 |
+
--train_batch_size 6 \
|
61 |
+
--num_train_epochs 3 \
|
62 |
+
--trainer sft \
|
63 |
+
--push_to_hub \
|
64 |
+
--repo_id ${repo_id} \
|
65 |
+
--token $HUGGINGFACE_TOKEN
|
66 |
+
"""
|
67 |
+
|
68 |
+
# Use subprocess.run() to execute the command
|
69 |
+
subprocess.run(command, shell=True, check=True, env=os.environ)
|