Ventsislav Muchinov committed
Commit e8081aa
1 Parent(s): f74f77b

Upload app.py

Files changed (1): app.py +4 -4
app.py CHANGED
@@ -12,11 +12,11 @@ DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
-model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-GPTQ-INT4"
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    device_map="cuda",
+    device_map="auto",
     trust_remote_code=True,
     token=ACCESS_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(
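
In short: this hunk swaps the 4-bit GPTQ 70B checkpoint for the full-precision 8B instruct model, and device_map="auto" lets accelerate decide where to place the fp16 weights (GPU first, CPU as fallback) instead of hard-coding a single CUDA device. A minimal loading sketch under those assumptions (transformers and accelerate installed, HF_TOKEN set for the gated meta-llama repo); variable names mirror the diff:

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
ACCESS_TOKEN = os.getenv("HF_TOKEN", "")

# "auto" asks accelerate to shard the weights across the visible
# devices rather than pinning everything to cuda:0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    token=ACCESS_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=ACCESS_TOKEN)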
@@ -39,7 +39,7 @@ def generate(
     conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to("cuda")
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
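
Note that dropping the trailing .to("cuda") leaves the prompt tensor on CPU; with a device-mapped model the usual pattern is to move it to model.device before generation. A sketch of the trimming step with that placement added, reusing model, tokenizer, and MAX_INPUT_TOKEN_LENGTH from the first hunk (the final placement line is an assumption, not part of this commit):

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
# Keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens so the
# prompt fits the context budget (the app also raises a Gradio warning here).
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
# Assumed addition: place the prompt where accelerate put the model.
input_ids = input_ids.to(model.device)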
@@ -101,7 +101,7 @@ chat_interface = gr.Interface(
             value=0.01,
         ),
     ],
-    title="Model testing",
+    title="Model testing - Meta-Llama-3-8B-Instruct",
     description="Provide system settings and a prompt to interact with the model.",
 )
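
The retitled UI simply surfaces the new checkpoint name. For context, a minimal gr.Interface wiring consistent with this hunk (the input and output components and their labels are assumptions; only the title, description, and the 0.01 slider value appear in the diff):

import gradio as gr

def generate(message, system_prompt, temperature):
    # Stand-in for the real handler, which queries the model.
    return f"(temperature={temperature}) {message}"

chat_interface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Message"),
        gr.Textbox(label="System prompt"),
        gr.Slider(minimum=0.01, maximum=4.0, step=0.01, value=0.01, label="Temperature"),
    ],
    outputs=gr.Textbox(label="Output"),
    title="Model testing - Meta-Llama-3-8B-Instruct",
    description="Provide system settings and a prompt to interact with the model.",
)

chat_interface.launch()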