reach-vb HF staff committed on
Commit
6d814c5
1 Parent(s): 00dc59f
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -19,7 +19,7 @@ from textwrap import dedent
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
 
21
  def generate_importance_matrix(model_path, train_data_path):
22
- imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
23
 
24
  os.chdir("llama.cpp")
25
 
@@ -146,9 +146,9 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
146
  quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
147
  quantized_gguf_path = quantized_gguf_name
148
  if use_imatrix:
149
- quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
150
  else:
151
- quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
152
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
153
  if result.returncode != 0:
154
  raise Exception(f"Error quantizing: {result.stderr}")
@@ -186,7 +186,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
186
 
187
  ### CLI:
188
  ```bash
189
- llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
190
  ```
191
 
192
  ### Server:
@@ -208,11 +208,11 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
208
 
209
  Step 3: Run inference through the main binary.
210
  ```
211
- ./main --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
212
  ```
213
  or
214
  ```
215
- ./server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
216
  ```
217
  """
218
  )
 
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
 
21
  def generate_importance_matrix(model_path, train_data_path):
22
+ imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
23
 
24
  os.chdir("llama.cpp")
25
 
 
146
  quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
147
  quantized_gguf_path = quantized_gguf_name
148
  if use_imatrix:
149
+ quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
150
  else:
151
+ quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
152
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
153
  if result.returncode != 0:
154
  raise Exception(f"Error quantizing: {result.stderr}")
 
186
 
187
  ### CLI:
188
  ```bash
189
+ llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
190
  ```
191
 
192
  ### Server:
 
208
 
209
  Step 3: Run inference through the main binary.
210
  ```
211
+ ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
212
  ```
213
  or
214
  ```
215
+ ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
216
  ```
217
  """
218
  )