File size: 6,169 Bytes
6be08fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Sample YAML file for configuration.
# Comment and uncomment values as needed. Every value has a default within the application.
# This file serves to be a drop in for config.yml

# Unless specified in the comments, DO NOT put these options in quotes!
# You can use https://www.yamllint.com/ if you want to check your YAML formatting.

# Options for networking
network:
  # The IP to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters
  host: 0.0.0.0

  # The port to host on (default: 5000)
  port: 5000

  # Disable HTTP token authenticaion with requests
  # WARNING: This will make your instance vulnerable!
  # Turn on this option if you are ONLY connecting from localhost
  disable_auth: False

# Options for logging
logging:
  # Enable prompt logging (default: False)
  prompt: False

  # Enable generation parameter logging (default: False)
  generation_params: False

# Options for sampling
sampling:
  # Override preset name. Find this in the sampler-overrides folder (default: None)
  # This overrides default fallbacks for sampler values that are passed to the API
  # Server-side overrides are NOT needed by default
  # WARNING: Using this can result in a generation speed penalty
  #override_preset: 

# Options for development and experimentation
developer:
  # Skips exllamav2 version check (default: False)
  # It's highly recommended to update your dependencies rather than enabling this flag
  # WARNING: Don't set this unless you know what you're doing!
  #unsafe_launch: False

  # Disable all request streaming (default: False)
  # A kill switch for turning off SSE in the API server
  #disable_request_streaming: False

  # Enable the torch CUDA malloc backend (default: False)
  # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
  cuda_malloc_backend: True

# Options for model overrides and loading
model:
  # Overrides the directory to look for models (default: models)
  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
  model_dir: models

  # An initial model to load. Make sure the model is located in the model directory!
  # A model can be loaded later via the API.
  # REQUIRED: This must be filled out to load a model on startup!
  model_name: Tess-v2.5.2-Qwen2-72B-safetensors_exl2_5.0bpw
  # Sends dummy model names when the models endpoint is queried
  # Enable this if the program is looking for a specific OAI model
  #use_dummy_models: False

  # The below parameters apply only if model_name is set

  # Max sequence length (default: Empty)
  # Fetched from the model's base sequence length in config.json by default
  max_seq_len: 19968
  # Overrides base model context length (default: Empty)
  # WARNING: Don't set this unless you know what you're doing!
  # Again, do NOT use this for configuring context length, use max_seq_len above ^
  # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
  #override_base_seq_len:

  # Automatically allocate resources to GPUs (default: True)
  # NOTE: Not parsed for single GPU users
  gpu_split_auto: True

  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
  # This is represented as an array of MB per GPU used
  autosplit_reserve: [6]

  # An integer array of GBs of vram to split between GPUs (default: [])
  # NOTE: Not parsed for single GPU users
  #gpu_split: [20.6, 24]

  # Rope scale (default: 1.0)
  # Same thing as compress_pos_emb
  # Only use if your model was trained on long context with rope (check config.json)
  # Leave blank to pull the value from the model
  #rope_scale: 1.0

  # Rope alpha (default: 1.0)
  # Same thing as alpha_value
  # Leave blank to automatically calculate alpha
  #rope_alpha: 1.0

  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
  #no_flash_attention: False

  # Enable different cache modes for VRAM savings (slight performance hit).
  # Possible values FP16, FP8, Q4. (default: FP16)
  cache_mode: Q4

  # Chunk size for prompt ingestion. A lower value reduces VRAM usage at the cost of ingestion speed (default: 2048)
  # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
  chunk_size: 2048

  # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
  # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
  # of the template you want to use.s
  # NOTE: Only works with chat completion message lists!
  #prompt_template:

  # Number of experts to use PER TOKEN. Fetched from the model's config.json if not specified (default: Empty)
  # WARNING: Don't set this unless you know what you're doing!
  # NOTE: For MoE models (ex. Mixtral) only!
  #num_experts_per_token:

  # Enables CFG support (default: False)
  # WARNING: This flag disables Flash Attention! (a stopgap fix until it's fixed in upstream)
  #use_cfg: False

  # Enables fasttensors to possibly increase model loading speeds (default: False)
  #fasttensors: true

  # Options for draft models (speculative decoding). This will use more VRAM!
  #draft:
    # Overrides the directory to look for draft (default: models)
    #draft_model_dir: models

    # An initial draft model to load. Make sure this model is located in the model directory!
    # A draft model can be loaded later via the API.
    #draft_model_name: A model name
  
    # Rope scale for draft models (default: 1.0)
    # Same thing as compress_pos_emb
    # Only use if your draft model was trained on long context with rope (check config.json)
    #draft_rope_scale: 1.0

    # Rope alpha for draft model (default: 1.0)
    # Same thing as alpha_value
    # Leave blank to automatically calculate alpha value
    #draft_rope_alpha: 1.0
  
  # Options for loras
  #lora:
    # Overrides the directory to look for loras (default: loras)
    #lora_dir: loras
    
    # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.
    #loras:
    #- name: lora1
    #  scaling: 1.0