Anthonyg5005 committed • Commit f3cac53 • parent: 8effb79

change up setups

didn't test linux but should work
auto-exl2-upload/auto-exl2-upload.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d112e7bf1d8f4f6f42c961edb46f89f8356ec5265798b493f3d6b55e2c994376
+size 8585
auto-exl2-upload/linux-setup.sh CHANGED
@@ -40,7 +40,7 @@ fi
 read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
 
 # ask to install flash attention
-echo "Flash attention is a feature that could fix overflow issues on some more broken models."
+echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
 read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
 if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
     echo "Invalid input. Please enter y or n."
@@ -69,7 +69,6 @@ rm download-model.py
 rm -rf exllamav2
 rm start-quant.sh
 rm enter-venv.sh
-rm -rf flash-attention
 
 # download stuff
 echo "Downloading files"
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
 venv/bin/python -m pip install huggingface-hub transformers accelerate
 venv/bin/python -m pip install ./exllamav2
 
-
-echo "Installing flash-attention..."
-echo "If failed, retry without flash-attention."
-git clone https://github.com/Dao-AILab/flash-attention
-venv/bin/python -m pip install ./flash-attention
-rm -rf flash-attention
-fi
+echo "Writing shell files..."
 
 # create start-quant.sh
 echo "#!/bin/bash" > start-quant.sh
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
 echo "bash --init-file venv/bin/activate" >> enter-venv.sh
 chmod +x enter-venv.sh
 
+if [ "$flash_attention" = "y" ]; then
+    echo "Going to attempt to install flash attention but it isn't required."
+    echo "You may close now if you'd like and continue without flash attention."
+    read -p "Press enter to continue and install flash attention"
+    echo "Get some popcorn and watch a movie, this will take a while."
+    echo "Installing flash-attn..."
+    venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+fi
+
 echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
 echo "Environment setup complete. run start-quant.sh to start the quantization process."
 read -p "Press enter to exit"
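The rewritten linux-setup.sh defers the optional flash-attention build until after everything required is in place, so an aborted or failed build no longer leaves the setup half-done. A quick post-setup sanity check, as a sketch only (not part of the committed scripts), assuming the venv layout this script creates; the Windows scripts would use venv\scripts\python.exe instead:

# Hypothetical check: a clean import means flash-attention's compiled
# extension loaded, and flash_attn exposes __version__ to confirm which
# build is active. Failure is harmless, since the scripts treat
# flash-attention as optional.
venv/bin/python -c "import flash_attn; print(flash_attn.__version__)"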
auto-exl2-upload/windows-setup.bat CHANGED
@@ -43,7 +43,7 @@ where nvcc
 set /p cuda_version="Please enter your CUDA version (11 or 12): "
 
 REM ask to install flash attention
-echo Flash attention is a feature that could fix overflow issues on some more broken models
+echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
 set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
 if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
     echo Invalid input. Please enter y or n.
@@ -69,7 +69,6 @@ del download-model.py
 rmdir /s /q exllamav2
 del start-quant.bat
 del enter-venv.bat
-rmdir /s /q flash-attention
 
 REM download stuff
 echo Downloading files...
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
 venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
 venv\scripts\python.exe -m pip install .\exllamav2
 
-
-echo Installing flash-attention. Go watch some movies, this will take a while...
-echo If failed, retry without flash-attention.
-git clone https://github.com/Dao-AILab/flash-attention
-venv\scripts\python.exe -m pip install .\flash-attention
-rmdir /s /q flash-attention
-)
+echo Writing batch files...
 
 REM create start-quant-windows.bat
 echo @echo off > start-quant.bat
@@ -106,6 +99,15 @@ REM create enter-venv.bat
 echo @echo off > enter-venv.bat
 echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
 
+if "%flash_attention%"=="y" (
+    echo Going to attempt to install flash attention but it isn't required.
+    echo You may close now if you'd like and continue without flash attention.
+    pause
+    echo Get some popcorn and watch a movie. This will take a while.
+    echo Installing flash-attn...
+    venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+)
+
 powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
 echo Environment setup complete. run start-quant.bat to start the quantization process.
 pause
exl2-multi-quant-local/exl2-multi-quant-local.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:96d89522925670652ab7ea1d6152a4e64c15302a940c9753a37345f2e9a06e58
+size 7408
exl2-multi-quant-local/linux-setup.sh CHANGED
@@ -40,7 +40,7 @@ fi
 read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
 
 # ask to install flash attention
-echo "Flash attention is a feature that could fix overflow issues on some more broken models."
+echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
 read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
 if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
     echo "Invalid input. Please enter y or n."
@@ -69,7 +69,6 @@ rm download-model.py
 rm -rf exllamav2
 rm start-quant.sh
 rm enter-venv.sh
-rm -rf flash-attention
 
 # download stuff
 echo "Downloading files"
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
 venv/bin/python -m pip install huggingface-hub transformers accelerate
 venv/bin/python -m pip install ./exllamav2
 
-
-echo "Installing flash-attention..."
-echo "If failed, retry without flash-attention."
-git clone https://github.com/Dao-AILab/flash-attention
-venv/bin/python -m pip install ./flash-attention
-rm -rf flash-attention
-fi
+echo "Writing shell files..."
 
 # create start-quant.sh
 echo "#!/bin/bash" > start-quant.sh
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
 echo "bash --init-file venv/bin/activate" >> enter-venv.sh
 chmod +x enter-venv.sh
 
+if [ "$flash_attention" = "y" ]; then
+    echo "Going to attempt to install flash attention but it isn't required."
+    echo "You may close now if you'd like and continue without flash attention."
+    read -p "Press enter to continue and install flash attention"
+    echo "Get some popcorn and watch a movie, this will take a while."
+    echo "Installing flash-attn..."
+    venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+fi
+
 echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
 echo "Environment setup complete. run start-quant.sh to start the quantization process."
 read -p "Press enter to exit"
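The pip git+ form used in both setup scripts compiles flash-attention's CUDA kernels from source, which is what the "few hours" warning refers to. On machines with many CPU cores but limited RAM, the upstream flash-attention README suggests capping ninja's parallel compile jobs via the MAX_JOBS environment variable. A sketch of that variant (same URL as the scripts; the value 4 is an arbitrary example):

# Cap parallel jobs during the from-source build; lower values trade
# build time for lower peak memory use.
MAX_JOBS=4 venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git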
exl2-multi-quant-local/windows-setup.bat CHANGED
@@ -43,7 +43,7 @@ where nvcc
 set /p cuda_version="Please enter your CUDA version (11 or 12): "
 
 REM ask to install flash attention
-echo Flash attention is a feature that could fix overflow issues on some more broken models
+echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
 set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
 if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
     echo Invalid input. Please enter y or n.
@@ -69,7 +69,6 @@ del download-model.py
 rmdir /s /q exllamav2
 del start-quant.bat
 del enter-venv.bat
-rmdir /s /q flash-attention
 
 REM download stuff
 echo Downloading files...
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
 venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
 venv\scripts\python.exe -m pip install .\exllamav2
 
-
-echo Installing flash-attention. Go watch some movies, this will take a while...
-echo If failed, retry without flash-attention.
-git clone https://github.com/Dao-AILab/flash-attention
-venv\scripts\python.exe -m pip install .\flash-attention
-rmdir /s /q flash-attention
-)
+echo Writing batch files...
 
 REM create start-quant-windows.bat
 echo @echo off > start-quant.bat
@@ -106,6 +99,15 @@ REM create enter-venv.bat
 echo @echo off > enter-venv.bat
 echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
 
+if "%flash_attention%"=="y" (
+    echo Going to attempt to install flash attention but it isn't required.
+    echo You may close now if you'd like and continue without flash attention.
+    pause
+    echo Get some popcorn and watch a movie. This will take a while.
+    echo Installing flash-attn...
+    venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+)
+
 powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
 echo Environment setup complete. run start-quant.bat to start the quantization process.
 pause