Ventsislav Muchinov committed on
Commit de66c65
1 Parent(s): 9a47cf8

Upload app.py

Files changed (1)
app.py +7 -4
app.py CHANGED
@@ -7,7 +7,7 @@ import spaces
 import torch
 
 
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -16,14 +16,17 @@ ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
 model_id = "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8"
 
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    device_map="auto",
+    device_map="cuda",
     trust_remote_code=True,
     low_cpu_mem_usage=True,
-    token=ACCESS_TOKEN)
+    quantize_config=quantization_config
+    token=ACCESS_TOKEN).to("cuda")
+
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
@@ -44,7 +47,7 @@ def generate(
     conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to("cuda")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")