reach-vb (HF staff) and SixOpen committed
Commit 4c4c78d
1 parent: c360795

imatrix support (#80)


- Imatrix support (349817ec391068d1ba939b87673b1a93884371b9)
- Imatrix (87a3f98b51bebef9bd5ece61549ca9358c00ff0d)
- Imatrix (70cc07f302c5c201546dac098b007428b8813282)
- Imatrix (a06efcaec53b01caa93c6e5704962763bf6e7506)


Co-authored-by: E <[email protected]>

Files changed (5)
  1. .gitattributes +1 -0
  2. Dockerfile +11 -4
  3. app.py +121 -25
  4. groups_merged.txt +0 -0
  5. start.sh +3 -2
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 llama.png filter=lfs diff=lfs merge=lfs -text
+imatrix_calibration.txt filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,4 +1,5 @@
-FROM python:3.9
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && \
     apt-get upgrade -y && \
@@ -21,8 +22,8 @@ RUN apt-get update && \
     libxmlsec1-dev \
     libffi-dev \
     liblzma-dev \
-    # gradio dependencies \
-    ffmpeg
+    ffmpeg \
+    nvidia-driver-515

 RUN useradd -m -u 1000 user
 USER user
@@ -43,6 +44,8 @@ COPY --chown=1000 . ${HOME}/app
 RUN git clone https://github.com/ggerganov/llama.cpp
 RUN pip install -r llama.cpp/requirements.txt

+COPY imatrix_calibration.txt ${HOME}/app/llama.cpp/
+
 ENV PYTHONPATH=${HOME}/app \
     PYTHONUNBUFFERED=1 \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
@@ -52,6 +55,10 @@ ENV PYTHONPATH=${HOME}/app \
     GRADIO_THEME=huggingface \
     TQDM_POSITION=-1 \
     TQDM_MININTERVAL=1 \
-    SYSTEM=spaces
+    SYSTEM=spaces \
+    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
+    PATH=/usr/local/nvidia/bin:${PATH}
+

 ENTRYPOINT /bin/sh start.sh
+
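The base image moves from python:3.9 to the CUDA 11.8 devel image so the imatrix pass can offload to the GPU (-ngl 99). Before starting a long imatrix run it can be worth confirming the container actually sees a GPU; a minimal sketch, assuming only that nvidia-smi is on the PATH inside the running Space (this helper is illustrative and not part of the commit):

import shutil
import subprocess

def cuda_runtime_available() -> bool:
    # Illustrative helper, not part of this commit: treat the container as
    # GPU-capable only if nvidia-smi exists and exits cleanly.
    if shutil.which("nvidia-smi") is None:
        return False
    result = subprocess.run(["nvidia-smi"], capture_output=True)
    return result.returncode == 0

if __name__ == "__main__":
    print("GPU visible:", cuda_runtime_available())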
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import signal
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 import gradio as gr

@@ -17,6 +18,35 @@ from textwrap import dedent

 HF_TOKEN = os.environ.get("HF_TOKEN")

+def generate_importance_matrix(model_path, train_data_path):
+    imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
+
+    os.chdir("llama.cpp")
+
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Files in the current directory: {os.listdir('.')}")
+
+    if not os.path.isfile(f"../{model_path}"):
+        raise Exception(f"Model file not found: {model_path}")
+
+    print("Running imatrix command...")
+    process = subprocess.Popen(imatrix_command, shell=True)
+
+    try:
+        process.wait(timeout=60)  # added wait
+    except subprocess.TimeoutExpired:
+        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
+        process.send_signal(signal.SIGINT)
+        try:
+            process.wait(timeout=5)  # grace period
+        except subprocess.TimeoutExpired:
+            print("Imatrix proc still didn't term. Forecfully terming process...")
+            process.kill()
+
+    os.chdir("..")
+
+    print("Importance matrix generation completed.")
+
 def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
@@ -57,7 +87,7 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):

     print("Sharded model has been uploaded successfully!")

-def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
+def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
@@ -96,18 +126,37 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     print("Model converted to fp16 successfully!")
     print(f"Converted model path: {fp16}")

+    imatrix_path = "llama.cpp/imatrix.dat"
+
+    if use_imatrix:
+        if train_data_file:
+            train_data_path = train_data_file.name
+        else:
+            train_data_path = "groups_merged.txt" #fallback calibration dataset
+
+        print(f"Training data file path: {train_data_path}")
+
+        if not os.path.isfile(train_data_path):
+            raise Exception(f"Training data file not found: {train_data_path}")
+
+        generate_importance_matrix(fp16, train_data_path)
+    else:
+        print("Not using imatrix quantization.")
     username = whoami(oauth_token.token)["name"]
-    quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}.gguf"
+    quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
     quantized_gguf_path = quantized_gguf_name
-    quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
+    if use_imatrix:
+        quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
+    else:
+        quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
     result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
     if result.returncode != 0:
         raise Exception(f"Error quantizing: {result.stderr}")
-    print(f"Quantized successfully with {q_method} option!")
+    print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
     print(f"Quantized model path: {quantized_gguf_path}")

     # Create empty repo
-    new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
+    new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
     new_repo_id = new_repo_url.repo_id
     print("Repo created successfully!", new_repo_url)

@@ -181,13 +230,26 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
         )
     except Exception as e:
         raise Exception(f"Error uploading quantized model: {e}")
+
+
+    imatrix_path = "llama.cpp/imatrix.dat"
+    if os.path.isfile(imatrix_path):
+        try:
+            print(f"Uploading imatrix.dat: {imatrix_path}")
+            api.upload_file(
+                path_or_fileobj=imatrix_path,
+                path_in_repo="imatrix.dat",
+                repo_id=new_repo_id,
+            )
+        except Exception as e:
+            raise Exception(f"Error uploading imatrix.dat: {e}")

     api.upload_file(
         path_or_fileobj=f"README.md",
         path_in_repo=f"README.md",
         repo_id=new_repo_id,
     )
-    print(f"Uploaded successfully with {q_method} option!")
+    print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")

     return (
         f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
@@ -201,58 +263,92 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):


 # Create Gradio interface
-with gr.Blocks() as demo:
+with gr.Blocks(css=".gradio-container {max-height: 600px; overflow-y: auto;}") as demo:
     gr.Markdown("You must be logged in to use GGUF-my-repo.")
     gr.LoginButton(min_width=250)

-    model_id_input = HuggingfaceHubSearch(
+    model_id = HuggingfaceHubSearch(
         label="Hub Model ID",
         placeholder="Search for model id on Huggingface",
         search_type="model",
     )

-    q_method_input = gr.Dropdown(
+    q_method = gr.Dropdown(
         ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
         label="Quantization Method",
         info="GGML quantization type",
         value="Q4_K_M",
-        filterable=False
+        filterable=False,
+        visible=True
+    )
+
+    imatrix_q_method = gr.Dropdown(
+        ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
+        label="Imatrix Quantization Method",
+        info="GGML imatrix quants type",
+        value="IQ4_NL",
+        filterable=False,
+        visible=False
+    )
+
+    use_imatrix = gr.Checkbox(
+        value=False,
+        label="Use Imatrix Quantization",
+        info="Use importance matrix for quantization."
     )

-    private_repo_input = gr.Checkbox(
+    private_repo = gr.Checkbox(
         value=False,
         label="Private Repo",
         info="Create a private repo under your username."
     )

-    split_model_input = gr.Checkbox(
+    train_data_file = gr.File(
+        label="Training Data File",
+        file_types=["txt"],
+        visible=False
+    )
+
+    split_model = gr.Checkbox(
         value=False,
         label="Split Model",
         info="Shard the model using gguf-split."
     )

-    split_max_tensors_input = gr.Number(
+    split_max_tensors = gr.Number(
         value=256,
         label="Max Tensors per File",
         info="Maximum number of tensors per file when splitting model.",
         visible=False
     )

-    split_max_size_input = gr.Textbox(
+    split_max_size = gr.Textbox(
         label="Max File Size",
         info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
         visible=False
     )

+    def update_visibility(use_imatrix):
+        return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
+
+    use_imatrix.change(
+        fn=update_visibility,
+        inputs=use_imatrix,
+        outputs=[q_method, imatrix_q_method, train_data_file]
+    )
+
     iface = gr.Interface(
         fn=process_model,
         inputs=[
-            model_id_input,
-            q_method_input,
-            private_repo_input,
-            split_model_input,
-            split_max_tensors_input,
-            split_max_size_input,
+            model_id,
+            q_method,
+            use_imatrix,
+            imatrix_q_method,
+            private_repo,
+            train_data_file,
+            split_model,
+            split_max_tensors,
+            split_max_size,
         ],
         outputs=[
             gr.Markdown(label="output"),
@@ -263,13 +359,13 @@ with gr.Blocks() as demo:
         api_name=False
     )

-    def update_visibility(split_model):
+    def update_split_visibility(split_model):
         return gr.update(visible=split_model), gr.update(visible=split_model)

-    split_model_input.change(
-        fn=update_visibility,
-        inputs=split_model_input,
-        outputs=[split_max_tensors_input, split_max_size_input]
+    split_model.change(
+        fn=update_split_visibility,
+        inputs=split_model,
+        outputs=[split_max_tensors, split_max_size]
     )

     def restart_space():
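Taken together, the app.py changes reduce to two extra llama.cpp invocations when "Use Imatrix Quantization" is ticked: build an importance matrix from a calibration text file, then hand that matrix to quantize via --imatrix. A minimal standalone sketch of that flow under assumed file names (the real code additionally chdirs into llama.cpp/, enforces a timeout, and falls back to groups_merged.txt when no training file is uploaded):

import subprocess

# Illustrative inputs, not from the commit: any fp16 GGUF and calibration text will do.
fp16_gguf = "example-model.fp16.gguf"
calibration_file = "groups_merged.txt"   # fallback calibration set shipped with this commit
output_gguf = "example-model-iq4_nl-imat.gguf"

# Step 1: compute the importance matrix (GPU-offloaded with -ngl 99, as in app.py).
# The tool writes imatrix.dat into the current working directory by default,
# which is why app.py runs it from inside llama.cpp/ and later reads llama.cpp/imatrix.dat.
subprocess.run(
    f"./llama.cpp/imatrix -m {fp16_gguf} -f {calibration_file} -ngl 99 --output-frequency 10",
    shell=True,
    check=True,
)

# Step 2: quantize with that matrix, mirroring the --imatrix branch of process_model.
subprocess.run(
    f"./llama.cpp/quantize --imatrix imatrix.dat {fp16_gguf} {output_gguf} IQ4_NL",
    shell=True,
    check=True,
)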
groups_merged.txt ADDED
The diff for this file is too large to render. See raw diff
 
start.sh CHANGED
@@ -1,4 +1,5 @@
 cd llama.cpp
-make -j quantize gguf-split
+make -j quantize gguf-split imatrix
+
 cd ..
-python app.py
+python app.py